library(plyr)
library(tidyverse)
library(DESeq2)
library(dplyr)
library(ggplot2)
counts_2ms01e <- read.table(
"final_counts_2ms01e.txt", sep="\t", header = TRUE)
counts_2ms02g <- read.table(
"final_counts_2ms02g.txt", sep="\t", header = TRUE)
counts_2ms03g <- read.table(
"final_counts_2ms03g.txt", sep="\t", header = TRUE)
counts_2ms04h <- read.table(
"final_counts_2ms04h.txt", sep="\t", header = TRUE)
counts_2ms01e.DeDup <- counts_2ms01e %>%
distinct(start, transcript_ID, .keep_all = TRUE)
counts_2ms02g.DeDup <- counts_2ms02g %>%
distinct(start, transcript_ID, .keep_all = TRUE)
counts_2ms03g.DeDup <- counts_2ms03g %>%
distinct(start, transcript_ID, .keep_all = TRUE)
counts_2ms04h.DeDup <- counts_2ms04h %>%
distinct(start, transcript_ID, .keep_all = TRUE)
nrow(counts_2ms01e.DeDup)
## [1] 3017
nrow(counts_2ms02g.DeDup)
## [1] 3017
nrow(counts_2ms03g.DeDup)
## [1] 3017
nrow(counts_2ms04h.DeDup)
## [1] 3017
I. select required columns II. rename columns - to preserve column-sample relationship
contig, start, end, geneName, geneID, transcriptID, unqC_My, unqC_Sp, mulC_My, mulC_Sp, totalC_My, totalC_Sp
counts_2ms01e.selected <- select(
counts_2ms01e.DeDup, contig,
start, end, gene_Name, gene_ID,
transcript_ID,
matches("unqC|mulC|totalC", ignore.case = TRUE))
counts_2ms02g.selected <- select(
counts_2ms02g.DeDup, contig,
start, transcript_ID,
matches("unqC|mulC|totalC", ignore.case = TRUE))
counts_2ms03g.selected <- select(
counts_2ms03g.DeDup, contig,
start, transcript_ID,
matches("unqC|mulC|totalC", ignore.case = TRUE))
counts_2ms04h.selected <- select(
counts_2ms04h.DeDup, contig,
start, transcript_ID,
matches("unqC|mulC|totalC", ignore.case = TRUE))
counts_2ms01e.Renamed <- plyr::rename(
counts_2ms01e.selected,
c(unqC_My = "unqC_My.ms01e", unqC_Sp = "unqC_Sp.ms01e",
mulC_My = "mulC_My.ms01e", mulC_Sp = "mulC_Sp.ms01e",
totalC_My = "totalC_My.ms01e", totalC_Sp = "totalC_Sp.ms01e"))
counts_2ms02g.Renamed <- plyr::rename(
counts_2ms02g.selected,
c(unqC_My = "unqC_My.ms02g", unqC_Sp = "unqC_Sp.ms02g",
mulC_My = "mulC_My.ms02g", mulC_Sp = "mulC_Sp.ms02g",
totalC_My = "totalC_My.ms02g", totalC_Sp = "totalC_Sp.ms02g"))
counts_2ms03g.Renamed <- plyr::rename(
counts_2ms03g.selected,
c(unqC_My = "unqC_My.ms03g", unqC_Sp = "unqC_Sp.ms03g",
mulC_My = "mulC_My.ms03g", mulC_Sp = "mulC_Sp.ms03g",
totalC_My = "totalC_My.ms03g", totalC_Sp = "totalC_Sp.ms03g"))
counts_2ms04h.Renamed <- plyr::rename(
counts_2ms04h.selected,
c(unqC_My = "unqC_My.ms04h", unqC_Sp = "unqC_Sp.ms04h",
mulC_My = "mulC_My.ms04h", mulC_Sp = "mulC_Sp.ms04h",
totalC_My = "totalC_My.ms04h", totalC_Sp = "totalC_Sp.ms04h"))
counts.merged.ms1e2g3g4h <- merge(
counts_2ms01e.Renamed, counts_2ms02g.Renamed,
by=c("contig", "start", "transcript_ID")) %>%
merge(counts_2ms03g.Renamed,
by=c("contig", "start", "transcript_ID")) %>%
merge(counts_2ms04h.Renamed,
by=c("contig", "start", "transcript_ID"))
# Note: This will merge the dataframes by matching the names of the contig, start, trans_ID
Take the merged data and select only genes/transcripts ..
I. ..with sufficient coverage (above 10 unique counts in at least one haplotype of 4 samples i.e in one unqC columns out of 8 cols)
counts.ms1e2g3g4h.Abv10Counts <- filter(
counts.merged.ms1e2g3g4h,
unqC_My.ms01e>10 |unqC_Sp.ms01e>10 |unqC_My.ms02g>10 |unqC_Sp.ms02g>10
|unqC_My.ms03g>10 |unqC_Sp.ms03g>10 |unqC_My.ms04h>10 |unqC_Sp.ms04h>10)
# First sum the unqC, mulC and totalC
counts.ms1e2g3g4h.Summed <- cbind(
counts.ms1e2g3g4h.Abv10Counts,
unqC_total = counts.ms1e2g3g4h.Abv10Counts %>%
select(matches("unqC")) %>%
rowSums(),
mulC_total =
counts.ms1e2g3g4h.Abv10Counts %>%
select(matches("mulC")) %>%
rowSums(),
totalC_total = counts.ms1e2g3g4h.Abv10Counts %>%
select(matches("totalC")) %>%
rowSums())
# Now, Select rows where the mulC reads < 20% of totalReads or unqC Reads > 20% totalReads
counts.ms1e2g3g4h.filt01 <- filter(
counts.ms1e2g3g4h.Summed,
mulC_total < .20*totalC_total |
unqC_total > .20*totalC_total) %>%
# remove the totaled column
select(-unqC_total, -mulC_total, -totalC_total) %>%
# then sort by contig and start position
arrange(start, contig)
I). Make a vertical dataframe with unqC reads for My and Sp alleles as separate columns and all of the samples together II). Apply additional filter (unqC_Sum > 16)
counts.vert.filt01 <- counts.ms1e2g3g4h.filt01 %>%
select(c(gene_ID, transcript_ID, starts_with("unqc"))) %>%
pivot_longer(-c(gene_ID, transcript_ID)) %>%
mutate(sample = paste0("2",str_sub(name, -5, -1)),
allele = str_sub(name, 1, 7)) %>%
select(-name) %>%
pivot_wider(names_from = allele, values_from = value)
To avoid situations with (0,0) counts and leave some significantly
counts.ForBinom <- counts.vert.filt01 %>%
mutate(unqC_Sum = unqC_My + unqC_Sp) %>%
filter(unqC_Sum >= 16)
Import the expression data and convert it into a matrix
# matrix data only has counts with one rowname column
# > the column "transcript_ID" can be set as rownames
# > select only unique counts (unqC) columns for data analyses
expressionData <- data.frame(
counts.ms1e2g3g4h.filt01, row.names = "transcript_ID") %>%
select(matches("unqC", ignore.case = TRUE))
head(expressionData)
## unqC_My.ms01e unqC_Sp.ms01e unqC_My.ms02g unqC_Sp.ms02g
## AL2G10020.t1 1414 3590 2 724
## AL2G10030.t1 1470 940 562 476
## AL2G10050.t1 212 390 30 0
## AL2G10060.t1 70 70 8 4
## AL2G10070.t1 26 42 0 0
## AL2G10080.t1 64 80 42 32
## unqC_My.ms03g unqC_Sp.ms03g unqC_My.ms04h unqC_Sp.ms04h
## AL2G10020.t1 752 2764 1806 2934
## AL2G10030.t1 1856 948 1538 928
## AL2G10050.t1 206 212 134 98
## AL2G10060.t1 50 50 114 34
## AL2G10070.t1 34 38 8 42
## AL2G10080.t1 58 70 24 12
*create a dataframe detailing the above co-variate:sample relationship
e.g: # individual <- factor(c(1,1,2,2,3,3,4,4)) # OR
# sample level variates
sampleID <- factor(rep(c("ms01e", "ms02g", "ms03g", "ms04h"), each = 2))
sampleID
## [1] ms01e ms01e ms02g ms02g ms03g ms03g ms04h ms04h
## Levels: ms01e ms02g ms03g ms04h
# haplotype level variates
hapType <- factor(rep(c("My", "Sp"), 4))
hapType
## [1] My Sp My Sp My Sp My Sp
## Levels: My Sp
# maternal cytoplasm level variates
cytoplasm = factor(rep("Ma", 8))
cytoplasm
## [1] Ma Ma Ma Ma Ma Ma Ma Ma
## Levels: Ma
# family level variates
familyGroup <- factor(rep(c("e", "g", "g", "h"), each=2))
familyGroup
## [1] e e g g g g h h
## Levels: e g h
exp_factors = data.frame(sampleID = sampleID,
hapType = hapType,
familyGroup = familyGroup,
maternalEff = cytoplasm)
head(exp_factors)
## sampleID hapType familyGroup maternalEff
## 1 ms01e My e Ma
## 2 ms01e Sp e Ma
## 3 ms02g My g Ma
## 4 ms02g Sp g Ma
## 5 ms03g My g Ma
## 6 ms03g Sp g Ma
**Optional: if our interest is which genes show ASE differences across different sample IDs (or group) our experimental design would be: expDesign <- model.matrix(~0 + sampleID + hapType + sampleID:hapType)
expDesign <- model.matrix(~0 + sampleID + hapType)
expDesign
## sampleIDms01e sampleIDms02g sampleIDms03g sampleIDms04h hapTypeSp
## 1 1 0 0 0 0
## 2 1 0 0 0 1
## 3 0 1 0 0 0
## 4 0 1 0 0 1
## 5 0 0 1 0 0
## 6 0 0 1 0 1
## 7 0 0 0 1 0
## 8 0 0 0 1 1
## attr(,"assign")
## [1] 1 1 1 1 2
## attr(,"contrasts")
## attr(,"contrasts")$sampleID
## [1] "contr.treatment"
##
## attr(,"contrasts")$hapType
## [1] "contr.treatment"
colnames(expDesign)
## [1] "sampleIDms01e" "sampleIDms02g" "sampleIDms03g" "sampleIDms04h"
## [5] "hapTypeSp"
colData (explains the co-variates:sample relationship)
ASE_Matrix <- DESeqDataSetFromMatrix(countData = expressionData,
colData = exp_factors,
design = ~ 0 + sampleID + hapType)
ASE_Matrix
## class: DESeqDataSet
## dim: 1761 8
## metadata(1): version
## assays(1): counts
## rownames(1761): AL2G10020.t1 AL2G10030.t1 ... AL2G41810.t1 AL2G41820.t1
## rowData names(0):
## colnames(8): unqC_My.ms01e unqC_Sp.ms01e ... unqC_My.ms04h
## unqC_Sp.ms04h
## colData names(4): sampleID hapType familyGroup maternalEff
colnames(ASE_Matrix)
## [1] "unqC_My.ms01e" "unqC_Sp.ms01e" "unqC_My.ms02g" "unqC_Sp.ms02g"
## [5] "unqC_My.ms03g" "unqC_Sp.ms03g" "unqC_My.ms04h" "unqC_Sp.ms04h"
.. no normalization for any variabilty betweens samples (this is done so that any library size differences won’t absorb the ASE)
sizeFactors(ASE_Matrix) <- rep(1, 2*4)
There are two separate paths in this workflow; the one we will see first involves transformations of the counts in order to visually explore sample relationships. In the second part, we will go back to the original raw counts for statistical testing. This is critical because the statistical testing methods rely on original count data (not scaled or transformed) for calculating the precision of measurements.
Details on part-D: * Exploratory plots can be applied on raw counts to see how the data looks. This can help reveal if distribution of the data is as expected. * Plot like PCA, dispersion, etc. * Depending upon the data the raw data can be transformed as need be to improve exploration/diagnosis
This plot are made using: classic Plot, ggplot2 and functions within DESeq2 Raw counts, transformed raw counts, results are used to prepare this plots
Link: https://www.bioconductor.org/help/course-materials/2016/CSAMA/lab-3-rnaseq/rnaseq_gene_CSAMA2016.html#transformations Link: https://www.bioconductor.org/help/workflows/rnaseqGene/#the-rlog-and-variance-stabilizing-transformations
While the ASE test is done using “DESeq” function plotting of the raw data may cause some problems. Thus it is important to transform the data to make it more homoskedastic and appr. for showing in plots. We are using the rlog transformation since is it appr. for small sample (n<100)
rld.ASE_Matrix <- rlog(ASE_Matrix, blind = TRUE) # using rlog
vsd.ASE_Matrix <- vst(ASE_Matrix, blind = TRUE) # variance stabilizing transformation
Plot the first Haplotype against second haplotype for log2 transformation (simply add 1, to avoid log of zero).
df.toPlot <- bind_rows(
as_data_frame(log2(counts(ASE_Matrix, normalized=TRUE)[, 1:2]+1)) %>%
mutate(transformation = "log2(x + 1)"),
as_data_frame(assay(rld.ASE_Matrix)[, 1:2]) %>% mutate(transformation = "rlog"),
as_data_frame(assay(vsd.ASE_Matrix)[, 1:2]) %>% mutate(transformation = "vst"))
## Warning: `as_data_frame()` was deprecated in tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
colnames(df.toPlot)[1:2] <- c("My", "Sp")
df.toPlot <- bind_rows(
as_data_frame(log2(counts(ASE_Matrix, normalized=TRUE)[, 1:2]+1)) %>%
mutate(transformation = "log2(x + 1)"),
as_data_frame(assay(rld.ASE_Matrix)[, 1:2]) %>% mutate(transformation = "rlog"),
as_data_frame(assay(vsd.ASE_Matrix)[, 1:2]) %>% mutate(transformation = "vst"))
colnames(df.toPlot)[1:2] <- c("My", "Sp")
ggplot(df.toPlot, aes(x = My, y = Sp)) + geom_hex(bins = 80) +
coord_fixed() + facet_grid( . ~ transformation) +
ggtitle("Comparison of different transformation of raw RNAseq counts") +
theme(plot.title = element_text(size = 10, face = "bold"))
## Warning: Computation failed in `stat_binhex()`:
##
## Computation failed in `stat_binhex()`:
##
## Computation failed in `stat_binhex()`:
ggplot(df.toPlot) +
geom_density(aes(x = My), fill = "blue", alpha = 0.6) +
geom_density(aes(x = Sp), fill = "light blue", alpha = 0.6) + facet_grid(~ transformation)
ggplot(df.toPlot) +
geom_histogram(aes(x = My), fill = "blue", alpha = 0.6) +
geom_histogram(aes(x = Sp), fill = "light blue", alpha = 0.6) + facet_grid(~ transformation)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Result: The rlog transformation obviously looks better
Assess overall similarity between samples
Questions: * Are sample similar by haplotypes or are haplotype similar by sample? * Do we have roughly equal contribution from all genes?
sampleDists.ASE_Matrix <- dist(t(assay(rld.ASE_Matrix)))
# Results: We can see that a Sample-Haplotype is closest to it's own Sample-AltHaplotype
# then it is more closer to AltSample-Haplotype and then AltSample-AltHaplotype
# E.g My.ms01e is closest to Sp.ms01e, then to other My haplotype than Sp haplotype
# My.ms01e vs. Sp.ms01e = 41.85; vs. My.ms02g = 51.021; vs. Sp.ms02g = 62.23
# is there a way to visualize this sample distances?
plot(hclust(sampleDists.ASE_Matrix))
library("gplots" )
##
## Attaching package: 'gplots'
## The following object is masked from 'package:IRanges':
##
## space
## The following object is masked from 'package:S4Vectors':
##
## space
## The following object is masked from 'package:stats':
##
## lowess
library("RColorBrewer" )
library("pheatmap")
library("PoiClaClu") # For Poisson Distances
colors = colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
# Euclidean Distances
sampleDistMatrix <- as.matrix(sampleDists.ASE_Matrix)
rownames(sampleDistMatrix) <- paste( rld.ASE_Matrix$hapType ,
rld.ASE_Matrix$sampleID, sep="-" )
colnames(sampleDistMatrix) <- NULL
sampleDistMatrix
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## My-ms01e 0.00000 41.92941 51.35710 63.35913 48.13836 59.76878 49.75835
## Sp-ms01e 41.92941 0.00000 62.17238 59.62649 58.93902 50.94437 58.62307
## My-ms02g 51.35710 62.17238 0.00000 46.89185 51.70238 62.34089 47.72633
## Sp-ms02g 63.35913 59.62649 46.89185 0.00000 63.02487 58.14543 60.17662
## My-ms03g 48.13836 58.93902 51.70238 63.02487 0.00000 43.69786 47.11232
## Sp-ms03g 59.76878 50.94437 62.34089 58.14543 43.69786 0.00000 57.05899
## My-ms04h 49.75835 58.62307 47.72633 60.17662 47.11232 57.05899 0.00000
## Sp-ms04h 60.79595 53.07824 60.81861 57.91596 59.74291 51.55098 44.03559
## [,8]
## My-ms01e 60.79595
## Sp-ms01e 53.07824
## My-ms02g 60.81861
## Sp-ms02g 57.91596
## My-ms03g 59.74291
## Sp-ms03g 51.55098
## My-ms04h 44.03559
## Sp-ms04h 0.00000
pheatmap(sampleDistMatrix,
clustering_distance_rows = sampleDists.ASE_Matrix,
clustering_distance_cols = sampleDists.ASE_Matrix,
col = colors)
sampleDistDF <- as.data.frame(sampleDistMatrix)
colnames(sampleDistDF) <- c("My-ms01e", "Sp-ms01e", "My-ms02g", "Sp-ms02g",
"My-ms03g", "Sp-ms03g", "My-ms04h", "Sp-ms04h")
# Make columns from rownames
sampleDistDF$sample_1 <- rownames(sampleDistDF)
# Make long-format data frame
sampleDistDF <- sampleDistDF %>%
pivot_longer(!sample_1,
names_to = "sample_2",
values_to = "dist")
# GGplot Heatmap Distances
grDevices::png(filename = "htmp_dist.png", width = 12.5, height = 8, units = 'cm', res = 400)
htmp_dist <- ggplot(sampleDistDF, aes(sample_1, sample_2, fill = dist)) +
geom_tile() +
scale_fill_gradient(low = "#457b92", high = "#f5ffff") +
labs(title = "Heatmap: sample distances") +
theme_minimal() +
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
legend.title = element_blank(),
plot.title = element_text(size = 12),
axis.text.x = element_text(angle = 45, hjust=1)
)
htmp_dist
dev.off()
## quartz_off_screen
## 2
knitr::include_graphics("htmp_dist.png")
# this function takes original count matrix (not normalized) with samples as rows
poisDists <- PoissonDistance(t(counts(ASE_Matrix)))
samplePoisDistMatrix <- as.matrix(poisDists$dd)
rownames(samplePoisDistMatrix) <- paste(
rld.ASE_Matrix$hapType, rld.ASE_Matrix$sampleID, sep = "-")
colnames(samplePoisDistMatrix) <- NULL
samplePoisDistMatrix
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## My-ms01e 0.0000 500.7647 697.9457 1062.0223 624.6305 962.4118 649.6450
## Sp-ms01e 500.7647 0.0000 1068.3020 954.9788 931.9396 716.0597 933.0843
## My-ms02g 697.9457 1068.3020 0.0000 630.8040 747.1794 1080.7018 659.1200
## Sp-ms02g 1062.0223 954.9788 630.8040 0.0000 1086.9830 924.2734 997.0650
## My-ms03g 624.6305 931.9396 747.1794 1086.9830 0.0000 529.0677 597.6537
## Sp-ms03g 962.4118 716.0597 1080.7018 924.2734 529.0677 0.0000 877.0273
## My-ms04h 649.6450 933.0843 659.1200 997.0650 597.6537 877.0273 0.0000
## Sp-ms04h 978.3080 742.6169 1020.0877 900.0103 928.6833 690.0129 508.5604
## [,8]
## My-ms01e 978.3080
## Sp-ms01e 742.6169
## My-ms02g 1020.0877
## Sp-ms02g 900.0103
## My-ms03g 928.6833
## Sp-ms03g 690.0129
## My-ms04h 508.5604
## Sp-ms04h 0.0000
pheatmap(samplePoisDistMatrix,
clustering_distance_cols = poisDists$dd,
clustering_distance_rows = poisDists$dd,
col = colors)
samplePoisDistDF <- as.data.frame(samplePoisDistMatrix)
colnames(samplePoisDistDF) <- c("My-ms01e", "Sp-ms01e", "My-ms02g", "Sp-ms02g",
"My-ms03g", "Sp-ms03g", "My-ms04h", "Sp-ms04h")
# Make columns frow rownames
samplePoisDistDF$sample_1 <- rownames(samplePoisDistDF)
# Make long-format data frame
samplePoisDistDF <- samplePoisDistDF %>%
pivot_longer(!sample_1,
names_to = "sample_2",
values_to = "pois_dist")
grDevices::png(filename = "htmp_pois_dist.png", width = 12.5, height = 8, units = 'cm', res = 400)
htmp_pois_dist <- ggplot(samplePoisDistDF, aes(sample_1, sample_2, fill = pois_dist)) +
geom_tile() +
scale_fill_gradient(low = "#457b92", high = "#f5ffff") +
labs(title = "Heatmap: samples poisson distances") +
theme_minimal() +
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
legend.title = element_blank(),
plot.title = element_text(size = 12),
axis.text.x = element_text(angle = 45, hjust=1)
)
htmp_pois_dist
dev.off()
## quartz_off_screen
## 2
knitr::include_graphics("htmp_pois_dist.png")
Another way to visualize sample distances is PCA plot. Here the data points are projected on a 2D plane on which one plane explains the most variation while the other plane might explain second most variation
# Save the results of pca as dataframe
PcaDF <- plotPCA(rld.ASE_Matrix, intgroup = c("sampleID", "hapType"), returnData = TRUE)
# Plot pca-plot using DESeq2 package
plotPCA(rld.ASE_Matrix, intgroup = c("sampleID", "hapType"))
grDevices::png(filename = "pca.png", width = 12.5, height = 8, units = 'cm', res = 400)
pca <- ggplot(PcaDF,aes(x = PC1,y = PC2,
color = group,
shape = group)
) +
# Make a scatter plot
geom_point(size = 4) +
scale_color_manual("Sample:hapType",
values = c("#E63946","#457b92", "#E63946","#457b92", "#E63946","#457b92", "#E63946", "#457b92")) +
scale_shape_manual("Sample:hapType",
values = c(19, 19, 17, 17, 15, 15, 3, 3)) +
labs(
title = "PCA",
y = "PC2: 23% variance",
x = "PC1: 26% variance"
) +
theme_bw() +
theme(
panel.grid.major = element_line(colour = 'grey85', size = 0.2),
panel.grid.minor = element_line(colour = 'grey90', size = 0.1),
axis.title.x = element_text(size = 10),
axis.title.y = element_text(size = 10),
plot.title = element_text(size = 12),
#legend.position = c(0.8,0.2), #You can adjust legend position here
#legend.background = element_rect(color = "black")
)
pca
dev.off()
## quartz_off_screen
## 2
knitr::include_graphics("pca.png")
Result: there is more variatio between samples than between haplotypes. But the variation between samples in haplotype group is similar to variation between samples in another haplotype group. The sample-by-Haplotype group forms a very distinct cluster i.e same haplotypes are more similar from different samples.
This is very similar to PCA plot. We use MDS (multidimensional scaling) function R base. This is useful when we don’t have matrix of data of matrix of distances.
mds <- as.data.frame(colData(rld.ASE_Matrix)) %>%
cbind(cmdscale(sampleDistMatrix)) %>%
mutate(group = paste0(sampleID, ":", hapType))
grDevices::png(filename = "mds.png", width = 12.5, height = 8, units = 'cm', res = 400)
mds_plot <- ggplot(mds, aes(x = `1`, y = `2`, color = group, shape = group)) +
geom_point(size = 4) +
scale_color_manual("Sample:hapType",
values = c("#E63946","#457b92", "#E63946","#457b92", "#E63946","#457b92", "#E63946", "#457b92")) +
scale_shape_manual("Sample:hapType",
values = c(19, 19, 17, 17, 15, 15, 3, 3)) +
coord_fixed() +
labs(
title = "MDS"
#y = "",
#x = ""
) +
theme_bw() +
theme(
panel.grid.major = element_line(colour = 'grey85', size = 0.2),
panel.grid.minor = element_line(colour = 'grey90', size = 0.1),
axis.title.x = element_text(size = 10),
axis.title.y = element_text(size = 10),
plot.title = element_text(size = 12)
#legend.position = c(0.8,0.2), #You can adjust legend position here
#legend.background = element_rect(color = "black")
)
mds_plot
dev.off()
## quartz_off_screen
## 2
knitr::include_graphics("mds.png")
mdsPois <- as.data.frame(colData(ASE_Matrix)) %>%
cbind(cmdscale(samplePoisDistMatrix)) %>%
mutate(group = paste0(sampleID, ":", hapType))
grDevices::png(filename = "mds_pois.png", width = 12.5, height = 8, units = 'cm', res = 400)
mds_pois <- ggplot(mdsPois, aes(x = `1`, y = `2`, color = group, shape = group)) +
geom_point(size = 4) +
scale_color_manual("Sample:hapType",
values = c("#E63946","#457b92", "#E63946","#457b92", "#E63946","#457b92", "#E63946", "#457b92")) +
scale_shape_manual("Sample:hapType",
values = c(19, 19, 17, 17, 15, 15, 3, 3)) +
coord_fixed() +
labs(
title = "MDS: Poisson distances"
#y = "",
#x = ""
) +
theme_bw() +
theme(
panel.grid.major = element_line(colour = 'grey85', size = 0.2),
panel.grid.minor = element_line(colour = 'grey90', size = 0.1),
axis.title.x = element_text(size = 10),
axis.title.y = element_text(size = 10),
plot.title = element_text(size = 12),
#legend.position = c(0.8,0.2), #You can adjust legend position here
#legend.background = element_rect(color = "black")
)
mds_pois
dev.off()
## quartz_off_screen
## 2
knitr::include_graphics("mds_pois.png")
quantsDF <- counts.ForBinom %>%
group_by(sample) %>%
summarise(q1 = quantile(unqC_Sum, probs = 0.25),
q2 = quantile(unqC_Sum, probs = 0.50),
q3 = quantile(unqC_Sum, probs = 0.75))
quantsDF
## # A tibble: 4 × 4
## sample q1 q2 q3
## <chr> <dbl> <dbl> <dbl>
## 1 2ms01e 146 430 1074
## 2 2ms02g 152 486 1136
## 3 2ms03g 142 404 1050
## 4 2ms04h 134 436 1060.
countsWithQuants <- counts.ForBinom %>%
left_join(quantsDF %>% dplyr::select(sample, q3), by = "sample") %>%
mutate(high_exp = ifelse(unqC_Sum >= q3, 1, 0))
countsWithQuants %>% group_by(sample) %>%
summarise(n = sum(high_exp))
## # A tibble: 4 × 2
## sample n
## <chr> <dbl>
## 1 2ms01e 396
## 2 2ms02g 389
## 3 2ms03g 400
## 4 2ms04h 399
high_exp_01 <- subset(countsWithQuants, sample == "2ms01e" & high_exp == 1)$transcript_ID
high_exp_02 <- subset(countsWithQuants, sample == "2ms02g" & high_exp == 1)$transcript_ID
high_exp_03 <- subset(countsWithQuants, sample == "2ms03g" & high_exp == 1)$transcript_ID
high_exp_04 <- subset(countsWithQuants, sample == "2ms04h" & high_exp == 1)$transcript_ID
#list of lists with genes for each sample
input_high_exp <- list("2ms01e" = high_exp_01, "2ms02g" = high_exp_02, "2ms03g" =high_exp_03, "2ms04h" = high_exp_04)
library(ggvenn)
## Loading required package: grid
grDevices::png(filename = "venn_high_exp.png", width = 12.5, height = 8, units = 'cm', res = 400)
venn_high_exp <-ggvenn(
input_high_exp,
fill_color = c("#0073C2FF", "#EFC000FF", "#868686FF", "#CD534CFF"),
stroke_size = 0.4, set_name_size = 3, text_size = 3
)
venn_high_exp
dev.off()
## quartz_off_screen
## 2
knitr::include_graphics("venn_high_exp.png")
counts.ForBinom$binom_p = apply(counts.ForBinom[,c("unqC_My", "unqC_Sp")],
1, function(x)binom.test(x[1],x[1]+x[2],p=0.5, alternative = "two.sided")$p.value)
counts.ForBinom$p0.05 <- ifelse(counts.ForBinom$binom_p < 0.05, 1, 0)
counts.ForBinom$p0.01 <- ifelse(counts.ForBinom$binom_p < 0.01, 1, 0)
counts.ForBinom$p0.001 <- ifelse(counts.ForBinom$binom_p < 0.001, 1, 0)
dds.ASE_Data <- DESeq(ASE_Matrix, fitType = "local", betaPrior = FALSE)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
Details: Calling results without any arguments will extract the estimated log2 fold changes and p values for the last variable in the design formula. If there are more than 2 levels for this variable { as is the case in this analysis { results will extract the results table for a comparison of the last level over the first level.
# Set "Sp" counts as the reference level
result.ASE_Data <- results(
dds.ASE_Data, contrast = c("hapType", "My", "Sp"),
alpha = 0.05)
table(result.ASE_Data$padj < 0.1)
##
## FALSE TRUE
## 1191 570
summary(result.ASE_Data)
##
## out of 1761 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 267, 15%
## LFC < 0 (down) : 191, 11%
## outliers [1] : 0, 0%
## low counts [2] : 0, 0%
## (mean count < 2)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
If we want to make the sig genes threshold more stricter we can narrow down padj and/or also use higher L2FC
result.ASE_Data.05 <- results(
dds.ASE_Data, contrast = c("hapType", "My", "Sp"),
alpha = 0.05, lfcThreshold = 1)
table(result.ASE_Data.05$padj < 0.05)
##
## FALSE TRUE
## 1605 94
summary(result.ASE_Data.05)
##
## out of 1761 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 1.00 (up) : 62, 3.5%
## LFC < -1.00 (down) : 32, 1.8%
## outliers [1] : 0, 0%
## low counts [2] : 62, 3.5%
## (mean count < 6)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
(**Optional): If we want the results that have gene expression at least double or halving in My i.e lfc of 1 is 2^1 = 2 (double the expression)
result.ASE_Data.LFC1 <- results(dds.ASE_Data, lfcThreshold = 1,
contrast = c("hapType", "My", "Sp"))
table(result.ASE_Data.LFC1$padj < 0.05)
##
## FALSE TRUE
## 1670 91
summary(result.ASE_Data.LFC1)
##
## out of 1761 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 1.00 (up) : 64, 3.6%
## LFC < -1.00 (down) : 35, 2%
## outliers [1] : 0, 0%
## low counts [2] : 0, 0%
## (mean count < 2)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Results as dataframe
result.ASE.asDF <- as.data.frame(result.ASE_Data)
result.ASE.LFC1.asDF <- as.data.frame(result.ASE_Data.LFC1)
result.ASE.asDF <- rownames_to_column(
result.ASE.asDF, var = "transcript_ID")
result.ASE.LFC1.asDF <- rownames_to_column(
result.ASE.LFC1.asDF , var = "transcript_ID")
head(result.ASE.asDF)
## transcript_ID baseMean log2FoldChange lfcSE stat pvalue
## 1 AL2G10020.t1 1748.25 -2.2956110 1.2112172 -1.8952925 0.0580536565
## 2 AL2G10030.t1 1089.75 0.6501060 0.1675992 3.8789320 0.0001049161
## 3 AL2G10050.t1 160.25 0.7450848 0.8984919 0.8292616 0.4069564221
## 4 AL2G10060.t1 50.00 0.6378227 0.4900287 1.3016026 0.1930522825
## 5 AL2G10070.t1 23.75 -0.9689420 0.7103124 -1.3641068 0.1725339419
## 6 AL2G10080.t1 47.75 0.1333075 0.3683307 0.3619235 0.7174092155
## padj
## 1 0.1577661868
## 2 0.0009521844
## 3 0.6059248002
## 4 0.3687256720
## 5 0.3421738043
## 6 0.8530493180
head(result.ASE.LFC1.asDF)
## transcript_ID baseMean log2FoldChange lfcSE stat pvalue padj
## 1 AL2G10020.t1 1748.25 -2.2956110 1.2112172 -1.069677 0.2847648 1
## 2 AL2G10030.t1 1089.75 0.6501060 0.1675992 0.000000 1.0000000 1
## 3 AL2G10050.t1 160.25 0.7450848 0.8984919 0.000000 1.0000000 1
## 4 AL2G10060.t1 50.00 0.6378227 0.4900287 0.000000 1.0000000 1
## 5 AL2G10070.t1 23.75 -0.9689420 0.7103124 0.000000 1.0000000 1
## 6 AL2G10080.t1 47.75 0.1333075 0.3683307 0.000000 1.0000000 1
Here we check how many genes from general ASE results table inter
sign_results <- subset(result.ASE.asDF, padj <= 0.001)$transcript_ID
sign_results_lfc1 <-subset(result.ASE.LFC1.asDF, padj <= 0.001)$transcript_ID
input_ase_results <- list("No threshold" = sign_results, "LFC Threshold = 1" = sign_results_lfc1)
grDevices::png(filename = "venn_ase_sign.png", width = 12.5, height = 8, units = 'cm', res = 400)
venn_ase_sign <- ggvenn(
input_ase_results,
fill_color = c("#0073C2FF", "#EFC000FF"),
stroke_size = 0.4, set_name_size = 3, text_size = 3
)
venn_ase_sign
dev.off()
## quartz_off_screen
## 2
knitr::include_graphics("venn_ase_sign.png")
Results: basically, the results seem to be obvious. When applying a threshold we simply have lower number of genes with adjusted p-value on the same level.
This extra step helps to add more metadata information to the “results” for better analyses and plotting
geneMap <- counts.ms1e2g3g4h.filt01 %>%
distinct(start, transcript_ID, .keep_all = TRUE) %>%
data.frame(row.names = "transcript_ID") %>%
mutate(
unqC_My.total = (unqC_My.ms01e + unqC_My.ms02g + unqC_My.ms03g + unqC_My.ms04h),
unqC_Sp.total = (unqC_Sp.ms01e + unqC_Sp.ms02g + unqC_Sp.ms03g + unqC_Sp.ms04h)) %>%
select(contig, start, end, gene_Name, gene_ID, unqC_My.total, unqC_Sp.total)
# Note: this geneMap database should have exact same number of rows as "results"
# plus the "rownames" should match between "results" and "geneMap"
head(geneMap)
## contig start end gene_Name gene_ID unqC_My.total unqC_Sp.total
## AL2G10020.t1 2 1116 3595 - AL2G10020 3974 10012
## AL2G10030.t1 2 5226 7090 CP5 AL2G10030 5426 3292
## AL2G10050.t1 2 22602 24361 - AL2G10050 582 700
## AL2G10060.t1 2 29596 30510 - AL2G10060 242 158
## AL2G10070.t1 2 46995 48041 BLT AL2G10070 68 122
## AL2G10080.t1 2 48700 50081 - AL2G10080 188 194
result.ASE_Data.withGeneMap <- cbind(result.ASE_Data, geneMap)
# Result/Note: this geneMap database should have exact same number of rows as "results"
# plus the "rownames" should match between "results" and "geneMap"
nrow(result.ASE_Data) == nrow(result.ASE_Data.withGeneMap)
## [1] TRUE
result.withGeneMap.asDF <- as.data.frame(result.ASE_Data.withGeneMap,
row.names = rownames(result.ASE_Data.withGeneMap))
result.withGeneMap.asDF <-
mutate(result.withGeneMap.asDF,
Significance = if_else(padj <= 0.01 & abs(log2FoldChange) >= 4, "|L2FC| > 4 & padj < 0.01",
if_else(padj <= 0.05 & abs(log2FoldChange) >= 2, "|L2FC| > 2 & padj < 0.05", "non-Sig")),
unqC_total = unqC_My.total + unqC_Sp.total)
Now, we start making plot either using “results” table or updated results table
Plot of the raw read counts stratified by two haplotypes for specific gene
topGene, PIN1, PIN3, TCP15 - Plot
topGene.data <- plotCounts(dds.ASE_Data, gene=which.min(result.ASE_Data.withGeneMap$padj),
intgroup="hapType", returnData=TRUE)
result.withGeneMap.asDF %>%
filter(padj == min(padj))
## baseMean log2FoldChange lfcSE stat pvalue
## AL2G19580.t1 371 2.599961 0.1739049 14.95048 1.546286e-50
## padj contig start end gene_Name gene_ID
## AL2G19580.t1 2.72301e-47 2 7856651 7857947 - AL2G19580
## unqC_My.total unqC_Sp.total Significance unqC_total
## AL2G19580.t1 2544 424 |L2FC| > 2 & padj < 0.05 2968
grDevices::png(filename = "top_gene.png", width = 12.5, height = 8, units = 'cm', res = 400)
top_gene <- ggplot(topGene.data, aes(x = hapType, y = count, color = sampleID, group = sampleID)) +
scale_y_log10() + geom_point(size = 2, position=position_dodge(0.4)) +
geom_line(position=position_dodge(0.4), size = 1) +
labs(title = "Raw counts for the Gene with the least p-vaue",
x = "Haplotype", y = "Raw counts") +
scale_color_manual("Sample", values = c("#7F58AF", "#64C5EB", "#E84D8A", "#FEB326")) +
scale_x_discrete(limits = c("Sp", "My")) +
theme_bw() +
theme(axis.text=element_text(size=12),
axis.title=element_text(size=10),
plot.title = element_text(size = rel(1)),
panel.grid.major = element_line(colour = 'grey85', size = 0.2),
panel.grid.minor = element_line(colour = 'grey90', size = 0.1),
axis.title.x = element_text(size = 10),
axis.title.y = element_text(size = 10),
#legend.position = c(0.8,0.2), #You can adjust legend position here
#legend.background = element_rect(color = "black")
)
top_gene
dev.off()
## quartz_off_screen
## 2
knitr::include_graphics("top_gene.png")
Result: this plots the raw counts by Haplotype for the given gene the points are jittered to prevent overlaps the lines are connected between Haplotypes of same samples.
d.PIN3 <- plotCounts(
dds.ASE_Data,
gene=which(result.ASE_Data.withGeneMap$gene_Name == "PIN3"),
intgroup="hapType", returnData=TRUE)
grDevices::png(filename = "pin3.png", width = 12.5, height = 8, units = 'cm', res = 400)
pin3 <- ggplot(d.PIN3, aes(x = hapType, y = count, color = sampleID, group = sampleID)) +
scale_y_log10() + geom_point(size = 2, position=position_dodge(0.4)) +
geom_line(position=position_dodge(0.4), size = 1) +
labs(title = "Raw counts for PIN3",
x = "Haplotype", y = "Raw counts") +
scale_color_manual("Sample", values = c("#7F58AF", "#64C5EB", "#E84D8A", "#FEB326")) +
theme_bw() +
theme(axis.text=element_text(size=12),
axis.title=element_text(size=10),
plot.title = element_text(size = rel(1)),
panel.grid.major = element_line(colour = 'grey85', size = 0.2),
panel.grid.minor = element_line(colour = 'grey90', size = 0.1),
axis.title.x = element_text(size = 10),
axis.title.y = element_text(size = 10),
#legend.position = c(0.8,0.2), #You can adjust legend position here
#legend.background = element_rect(color = "black")
)
pin3
dev.off()
## quartz_off_screen
## 2
knitr::include_graphics("pin3.png")
subset(result.withGeneMap.asDF, gene_Name == "PIN3")$padj
## [1] 1.053251e-07
d.PIN1 <- plotCounts(
dds.ASE_Data,
gene=which(result.ASE_Data.withGeneMap$gene_Name == "PIN1"),
intgroup="hapType", returnData=TRUE)
grDevices::png(filename = "pin1.png", width = 12.5, height = 8, units = 'cm', res = 400)
pin1 <- ggplot(d.PIN1, aes(x = hapType, y = count, color = sampleID, group = sampleID)) +
scale_y_log10() + geom_point(size = 2, position=position_dodge(0.4)) +
geom_line(position=position_dodge(0.4), size = 1) +
labs(title = "Raw counts for PIN1",
x = "Haplotype", y = "Raw counts") +
scale_color_manual("Sample", values = c("#7F58AF", "#64C5EB", "#E84D8A", "#FEB326")) +
theme_bw() +
theme(axis.text=element_text(size=12),
axis.title=element_text(size=10),
plot.title = element_text(size = rel(1)),
panel.grid.major = element_line(colour = 'grey85', size = 0.2),
panel.grid.minor = element_line(colour = 'grey90', size = 0.1),
axis.title.x = element_text(size = 10),
axis.title.y = element_text(size = 10),
#legend.position = c(0.8,0.2), #You can adjust legend position here
#legend.background = element_rect(color = "black")
)
pin1
dev.off()
## quartz_off_screen
## 2
knitr::include_graphics("pin1.png")
subset(result.withGeneMap.asDF, gene_Name == "PIN1")$padj
## [1] 0.0809853
d.TCP15 <- plotCounts(
dds.ASE_Data,
gene=which(result.ASE_Data.withGeneMap$gene_ID == "AL2G28860"),
intgroup="hapType", returnData=TRUE)
grDevices::png(filename = "tcp15.png", width = 12.5, height = 8, units = 'cm', res = 400)
tcp15 <- ggplot(d.TCP15, aes(x = hapType, y = count, color = sampleID, group = sampleID)) +
scale_y_log10() + geom_point(size = 2, position=position_dodge(0.4)) +
geom_line(position=position_dodge(0.4), size = 1) +
labs(title = "Raw counts for TCP15",
x = "Haplotype", y = "Raw counts") +
scale_color_manual("Sample", values = c("#7F58AF", "#64C5EB", "#E84D8A", "#FEB326")) +
scale_x_discrete(limits = c("Sp", "My")) +
theme_bw() +
theme(axis.text=element_text(size=12),
axis.title=element_text(size=10),
plot.title = element_text(size = rel(1)),
panel.grid.major = element_line(colour = 'grey85', size = 0.2),
panel.grid.minor = element_line(colour = 'grey90', size = 0.1),
axis.title.x = element_text(size = 10),
axis.title.y = element_text(size = 10),
#legend.position = c(0.8,0.2), #You can adjust legend position here
#legend.background = element_rect(color = "black")
)
tcp15
dev.off()
## quartz_off_screen
## 2
knitr::include_graphics("tcp15.png")
subset(result.withGeneMap.asDF, gene_ID == "AL2G28860")$padj
## [1] 0.9814344
d.TCP22 <- plotCounts(
dds.ASE_Data,
gene=which(result.ASE_Data.withGeneMap$gene_ID == "AL2G31640"),
intgroup="hapType", returnData=TRUE)
grDevices::png(filename = "tcp22.png", width = 12.5, height = 8, units = 'cm', res = 400)
tcp22 <- ggplot(d.TCP22, aes(x = hapType, y = count, color = sampleID, group = sampleID)) +
scale_y_log10() + geom_point(size = 2, position=position_dodge(0.4)) +
geom_line(position=position_dodge(0.4), size = 1) +
labs(title = "Raw counts for TCP22",
x = "Haplotype", y = "Raw counts") +
scale_color_manual("Sample", values = c("#7F58AF", "#64C5EB", "#E84D8A", "#FEB326")) +
scale_x_discrete(limits = c("Sp", "My")) +
theme_bw() +
theme(axis.text=element_text(size=12),
axis.title=element_text(size=10),
plot.title = element_text(size = rel(1)),
panel.grid.major = element_line(colour = 'grey85', size = 0.2),
panel.grid.minor = element_line(colour = 'grey90', size = 0.1),
axis.title.x = element_text(size = 10),
axis.title.y = element_text(size = 10),
#legend.position = c(0.8,0.2), #You can adjust legend position here
#legend.background = element_rect(color = "black")
)
tcp22
dev.off()
## quartz_off_screen
## 2
knitr::include_graphics("tcp22.png")
subset(result.withGeneMap.asDF, gene_ID == "AL2G31640")$padj
## [1] 0.8161894
d.AP1 <- plotCounts(
dds.ASE_Data,
gene=which(result.ASE_Data.withGeneMap$gene_ID == "AL2G28180"),
intgroup="hapType", returnData=TRUE)
grDevices::png(filename = "ap1.png", width = 12.5, height = 8, units = 'cm', res = 400)
ap1 <- ggplot(d.AP1, aes(x = hapType, y = count, color = sampleID, group = sampleID)) +
scale_y_log10() + geom_point(size = 2, position=position_dodge(0.4)) +
geom_line(position=position_dodge(0.4), size = 1) +
labs(title = "Raw counts for AP1",
x = "Haplotype", y = "Raw counts") +
scale_color_manual("Sample", values = c("#7F58AF", "#64C5EB", "#E84D8A", "#FEB326")) +
scale_x_discrete(limits = c("Sp", "My")) +
theme_bw() +
theme(axis.text=element_text(size=12),
axis.title=element_text(size=10),
plot.title = element_text(size = rel(1)),
panel.grid.major = element_line(colour = 'grey85', size = 0.2),
panel.grid.minor = element_line(colour = 'grey90', size = 0.1),
axis.title.x = element_text(size = 10),
axis.title.y = element_text(size = 10),
#legend.position = c(0.8,0.2), #You can adjust legend position here
#legend.background = element_rect(color = "black")
)
ap1
dev.off()
## quartz_off_screen
## 2
knitr::include_graphics("ap1.png")
subset(result.withGeneMap.asDF, gene_ID == "AL2G28180")$padj
## [1] 0.9107402
Here we make categories of genes depending on the Log2FoldChange and p-value adjusted.
The attitude is as follows:
We take < 0.6 and > 0.6 for Log2FoldChange according to this link: https://biocorecrg.github.io/CRG_RIntroduction/volcano-plots.html
In addition, we make separate groups for significance which derived purely according to p-values.
# Make a copy of the DESeq2 results table
result.ASE.copy <- result.ASE.asDF
# -Log10 of p-value adjusted
result.ASE.copy$padj_log10 <- -log10(result.ASE.copy$padj)
# Make groups of genes depending on the
result.ASE.copy <- result.ASE.copy %>%
mutate(
cat = case_when(
log2FoldChange < -0.6 & padj <= 0.05 ~ "Lower",
log2FoldChange > 0.6 & padj <= 0.05 ~ "Upper",
TRUE ~ "NS"),
sign = case_when(
padj <= 0.001 ~ "< 0.001",
padj <= 0.01 ~ "< 0.01",
padj <= 0.05 ~ "< 0.05",
padj <= 0.1 ~ "< 0.1",
TRUE ~ "ns")
)
# Check categories
knitr::kable(table(result.ASE.copy$cat))
| Var1 | Freq |
|---|---|
| Lower | 162 |
| NS | 1377 |
| Upper | 222 |
We interpret the table as follows:
| category | p-value interval |
|---|---|
| < 0.001 | p-value <= 0.001 |
| < 0.01 | 0.001 < p-value <= 0.01 |
| < 0.05 | 0.01 < p-value <= 0.05 |
| < 0.01 | 0.05 < p-value <= 0.1 |
| ns | p-value > 0.1 |
knitr::kable(table(result.ASE.copy$sign))
| Var1 | Freq |
|---|---|
| < 0.001 | 196 |
| < 0.01 | 90 |
| < 0.05 | 172 |
| < 0.1 | 112 |
| ns | 1191 |
result.ASE.copy %>%
group_by(sign) %>%
summarise(n = n()) %>%
ggplot(aes(x = sign, y = n, fill = sign)) +
geom_bar(stat = "identity") +
geom_text(aes(label = n), vjust = 2) +
labs(x = "Significance level",
y = "Count") +
scale_fill_manual("Sign. level",
values = c("#457b92", "#6f9baa", "#9abcc4", "#c7dde0", "#9e9e9e")) +
theme_bw() +
theme(
panel.grid.major = element_line(colour = 'grey85', size = 0.2),
panel.grid.minor = element_line(colour = 'grey90', size = 0.1),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
plot.title = element_text(size = 16),
axis.text.x = element_text(angle = 45, hjust=1),
#legend.position = c(0.8,0.2), #You can adjust legend position here
#legend.background = element_rect(color = "black")
)
grDevices::png(filename = "volc_wald.png", width = 12.5, height = 8, units = 'cm', res = 400)
# Volcano Plot
volc_wald <- result.ASE.copy %>%
ggplot(mapping = aes(x = log2FoldChange, y = padj_log10)) +
geom_point(data = result.ASE.copy %>% filter(cat == "Lower"), aes(color = sign), alpha = 0.9) +
scale_color_manual("My < Sp", #legend title
values = c("#457b92","#7eb1c7","#b8eaff"), #colors for sign_level
guide = guide_legend(override.aes = list(size = 4)) #size of the points inside legend
) +
ggnewscale::new_scale_color() + # Need to add new gradient scale
# Beyond abline
geom_point(data = result.ASE.copy %>% filter(cat == "Upper"), aes(color = sign), alpha = 0.9) +
scale_color_manual("My > Sp",
values = c("#E63946","#f98e89","#ffd6d2"),
guide = guide_legend(override.aes = list(size = 4))
) +
ggnewscale::new_scale_color() +
# Not significant
geom_point(data = result.ASE.copy %>% filter(cat == "NS"), aes(color = "Not sign."), size = 1, alpha = 0.8) +
scale_color_manual("",values= "#525252", labels = "Not sign.",
guide = guide_legend(override.aes = list(size = 4))
) +
#ggnewscale::new_scale_color() +
labs(x = "Log2 Folds Change",
y = "-log10 p-value",
title = "Volcano plot") +
theme_bw() +
theme(
panel.grid.major = element_line(colour = 'grey85', size = 0.2),
panel.grid.minor = element_line(colour = 'grey90', size = 0.1),
axis.title.x = element_text(size = 10),
axis.title.y = element_text(size = 10),
plot.title = element_text(size = 12),
#legend.key = element_rect(color = "black")
legend.title = element_text(size = 10),
legend.text = element_text(size = 10),
legend.spacing.y = unit(0.05, "cm")
)
volc_wald
dev.off()
## quartz_off_screen
## 2
knitr::include_graphics("volc_wald.png")
library(ggpubr)
##
## Attaching package: 'ggpubr'
## The following object is masked from 'package:plyr':
##
## mutate
grDevices::png(filename = "ma_pubr.png", width = 12.5, height = 8, units = 'cm', res = 400)
ma_pubr <- ggpubr::ggmaplot(result.ASE_Data.withGeneMap, fdr = 0.05, fc = 1.5, size = 0.4,
main = "MA plot", legend = "top", top = 20,
palette = c("#B31B21", "#1465AC", "darkgray"),
genenames = as.vector(if_else(result.ASE_Data.withGeneMap$padj < 0.001, result.ASE_Data.withGeneMap$gene_Name, NULL)),
font.label = c("bold", 6), font.legend = c("bold", 8), font.main = c("bold", 8),
font.x = 8, font.y = 8,
ggtheme = ggplot2::theme_bw())
ma_pubr
dev.off()
## quartz_off_screen
## 2
knitr::include_graphics("ma_pubr.png")
grDevices::png(filename = "md_wald.png", width = 12.5, height = 8, units = 'cm', res = 400)
md_wald <- result.ASE.copy %>%
ggplot(mapping = aes(y = log2FoldChange, x = baseMean)) +
geom_point(data = result.ASE.copy %>% filter(cat == "Lower"), aes(color = sign), alpha = 0.9) +
scale_color_manual("My < Sp", #legend title
values = c("#457b92","#7eb1c7","#b8eaff"), #colors for sign_level
guide = guide_legend(override.aes = list(size = 4)) #size of the points inside legend
) +
ggnewscale::new_scale_color() + # Need to add new gradient scale
# Beyond abline
geom_point(data = result.ASE.copy %>% filter(cat == "Upper"), aes(color = sign), alpha = 0.9) +
scale_color_manual("My > Sp",
values = c("#E63946","#f98e89","#ffd6d2"),
guide = guide_legend(override.aes = list(size = 4))
) +
ggnewscale::new_scale_color() +
# Not significant
geom_point(data = result.ASE.copy %>% filter(cat == "NS"), aes(color = "Not sign."), size = 1, alpha = 0.8) +
scale_color_manual("",values= "#525252", labels = "Not sign.",
guide = guide_legend(override.aes = list(size = 4))
) +
#ggnewscale::new_scale_color() +
labs(y = "Log2 Folds Change",
x = "Log2 mean expression",
title = "MA Plot") +
scale_x_continuous(trans = "log2") +
theme_bw() +
theme(
panel.grid.major = element_line(colour = 'grey85', size = 0.2),
panel.grid.minor = element_line(colour = 'grey90', size = 0.1),
axis.title.x = element_text(size = 10),
axis.title.y = element_text(size = 10),
plot.title = element_text(size = 12),
#legend.key = element_rect(color = "black")
legend.title = element_text(size = 10),
legend.text = element_text(size = 10),
legend.spacing.y = unit(0.05, "cm")
)
md_wald
dev.off()
## quartz_off_screen
## 2
knitr::include_graphics("md_wald.png")
plotDispEsts(dds.ASE_Data, ylim = c(1e-6, 1e1))
hist( result.ASE_Data.withGeneMap$pvalue,
breaks=0:20/20, col="grey50", border = "white",
main = "Histogram of p-values", xlab = "p-value")
This plot gene expression (log2FoldChanges i.e ASE) by chromosome and genomic position. Then color the points where the p-adj are significant.
ggplot(result.withGeneMap.asDF, aes(x=start, y= log2FoldChange)) +
labs(title = "Allele Expression Balance Plot: My vs. Sp", x = "Genome Position",
y = "log2FoldChange: \nMy haplotype vs. Sp haplotype") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5), legend.position = "bottom") +
geom_hline(yintercept = c(-4, -2, 2, 4) ,
color = c("darkgreen", "darkred", "darkred", "darkgreen")) +
geom_point(aes(x=start, y= log2FoldChange,
color = Significance, shape = Significance,
size = padj),
position=position_jitter(w=0.1,h=0)) +
scale_shape_manual(values=c(3, 4, 5)) #+
#scale_color_manual(values = c("green", "red", "black"))
This helps us to show how the log2(unique counts) changes for My haplotype vs. Sp haplotype.
ggplot(result.withGeneMap.asDF, aes(x=log2(unqC_Sp.total+1) , y= log2(unqC_My.total+1))) +
labs(title = "log2 X-Y plot", x = "log2(# of unique Sp haplotype reads + 1)",
y = "log2(# of unique My haplotype reads + 1)") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5), legend.position = "bottom") +
geom_abline(slope = 1, intercept = 0, color = "darkred")+
geom_point(aes(x=log2(unqC_Sp.total+1) , y= log2(unqC_My.total+1),
color = Significance, shape = Significance,
size = padj),
position=position_jitter(w=0.1,h=0)) +
scale_shape_manual(values=c(3, 4, 5)) #+
#scale_color_manual(name = "legend of significance")
ggplot(result.withGeneMap.asDF, aes(x = start, y = -log10(padj))) +
geom_point(aes(x = start, y = -log10(padj)))
counts_2ms01e.joinTest <- counts_2ms01e %>%
distinct(transcript_ID, .keep_all = TRUE)
counts_2ms02g.joinTest <- counts_2ms02g %>%
distinct(transcript_ID, .keep_all = TRUE)
counts_2ms03g.joinTest <- counts_2ms03g %>%
distinct(transcript_ID, .keep_all = TRUE)
counts_2ms04h.joinTest <- counts_2ms04h %>%
distinct(transcript_ID, .keep_all = TRUE)
# Union samples verically
data_all <- rbind(counts_2ms01e.joinTest,
counts_2ms02g.joinTest,
counts_2ms03g.joinTest,
counts_2ms04h.joinTest)
# Get the variable which represent the QTL region
data_all <- data_all %>%
mutate(unqC_Sum = unqC_My + unqC_Sp,
qtl_pos = ifelse(start >= 12875963 & start <= 16344528, 1, 0))
This data will than be used for labelling genes.
max_exp <- data_all %>%
select(transcript_ID, gene_ID, unqC_My, unqC_Sp) %>%
pivot_longer(c(unqC_My, unqC_Sp), names_to = "allele", values_to = "exp") %>%
group_by(transcript_ID) %>%
summarise(max_exp = max(exp))
genes_20 <- result.withGeneMap.asDF %>%
arrange(padj) %>%
select(gene_ID, start, padj) %>%
tibble::rownames_to_column("transcript_ID") %>%
head(20) %>%
left_join(max_exp, by = "transcript_ID") %>%
mutate(max_log = log2(max_exp + 1) + 1,
qtl_pos = ifelse(start >= 12875963 & start <= 16344528, 1, 0)) %>%
arrange(start)
data_bx_20 <- data_all %>%
filter(gene_ID %in% genes_20$gene_ID) %>%
select(transcript_ID, gene_ID, start, unqC_My, unqC_Sp) %>%
pivot_longer(c(unqC_My, unqC_Sp), names_to = "allele", values_to = "exp") %>%
mutate(log_exp = log2(exp + 1),
qtl_pos = ifelse(start >= 12875963 & start <= 16344528, 1, 0))
genes_20_table <- data_all %>%
filter(gene_ID %in% genes_20$gene_ID) %>%
select(transcript_ID, unqC_My, unqC_Sp) %>%
group_by(transcript_ID) %>%
summarise(mean_My = mean(unqC_My),
sd_My = sd(unqC_My),
mean_Sp = mean(unqC_Sp),
sd_Sp = sd(unqC_Sp)) %>%
left_join(result.withGeneMap.asDF %>%
tibble::rownames_to_column("transcript_ID") %>%
select(transcript_ID, gene_ID, padj, start),
by = "transcript_ID") %>%
mutate(qtl_pos = ifelse(start >= 12875963 & start <= 16344528, 1, 0))
In the part F-05 II and following parts F-05 III and F-05 IV we build “Big Boxplots”. These boxplots contain all the genes which follow a specific condidition of significance for difference in allele expression.
This critereas: * Big Boxplot №1 - all the genes which have Wald-test p-value < 0.001 * Big Boxplot №2 - all the genes which are in category: |L2FC| > 2 & padj < 0.05 * Big Boxplot №3 - all the genes which are in category: |L2FC| > 4 & padj < 0.01
colors <- c("#00A5E3", "#8DD7BF", "#FF96C5", "#FF5768", "#FFBF65", "#00B0BA")
genes_1 <- result.withGeneMap.asDF %>%
filter(padj <= 0.001) %>%
select(gene_ID, start, padj) %>%
tibble::rownames_to_column("transcript_ID") %>%
left_join(max_exp, by = "transcript_ID") %>%
mutate(max_log = log2(max_exp + 1) + 1,
qtl_pos = ifelse(start >= 12875963 & start <= 16344528, 1, 0)) %>%
arrange(start)
data_bigbx1 <- data_all %>%
filter(transcript_ID %in% genes_1$transcript_ID) %>%
select(transcript_ID, gene_ID, start, unqC_My, unqC_Sp, unqC_Sum) %>%
#pivot_longer(c(unqC_My, unqC_Sp), names_to = "allele", values_to = "exp") %>%
mutate(
#log_exp = log2(exp + 1),
qtl_pos = ifelse(start >= 12875963 & start <= 16344528, 1, 0))
Threshold in this case was chosen visually to keep only five boxplots filled with color for each of the plots.The same approach works for the other two big boxplots.
gene_names1 <- data_bigbx1 %>%
group_by(start, gene_ID) %>%
dplyr::summarise(max = max(unqC_Sum)) %>%
filter(max > 9000)
## `summarise()` has grouped output by 'start'. You can override using the
## `.groups` argument.
high_boxplots1 <- data_bigbx1 %>%
group_by(start, gene_ID) %>%
filter(max(unqC_Sum) > 9000)
genes_2 <- result.withGeneMap.asDF %>%
filter(Significance == "|L2FC| > 2 & padj < 0.05") %>%
select(gene_ID, start, padj) %>%
tibble::rownames_to_column("transcript_ID") %>%
left_join(max_exp, by = "transcript_ID") %>%
mutate(max_log = log2(max_exp + 1) + 1,
qtl_pos = ifelse(start >= 12875963 & start <= 16344528, 1, 0)) %>%
arrange(start)
data_bigbx2 <- data_all %>%
filter(transcript_ID %in% genes_2$transcript_ID) %>%
select(transcript_ID, gene_ID, start, unqC_My, unqC_Sp, unqC_Sum) %>%
#pivot_longer(c(unqC_My, unqC_Sp), names_to = "allele", values_to = "exp") %>%
mutate(
#log_exp = log2(exp + 1),
qtl_pos = ifelse(start >= 12875963 & start <= 16344528, 1, 0))
gene_names2 <- data_bigbx2 %>%
group_by(start, gene_ID) %>%
dplyr::summarise(max = max(unqC_Sum)) %>%
filter(max > 3700)
## `summarise()` has grouped output by 'start'. You can override using the
## `.groups` argument.
high_boxplots2 <- data_bigbx2 %>%
group_by(start, gene_ID) %>%
filter(max(unqC_Sum) > 3700)
genes_3 <- result.withGeneMap.asDF %>%
filter(Significance == "|L2FC| > 4 & padj < 0.01") %>%
select(gene_ID, start, padj) %>%
tibble::rownames_to_column("transcript_ID") %>%
left_join(max_exp, by = "transcript_ID") %>%
mutate(max_log = log2(max_exp + 1) + 1,
qtl_pos = ifelse(start >= 12875963 & start <= 16344528, 1, 0)) %>%
arrange(start)
data_bigbx3 <- data_all %>%
filter(transcript_ID %in% genes_3$transcript_ID) %>%
select(transcript_ID, gene_ID, start, unqC_My, unqC_Sp, unqC_Sum) %>%
#pivot_longer(c(unqC_My, unqC_Sp), names_to = "allele", values_to = "exp") %>%
mutate(
#log_exp = log2(exp + 1),
qtl_pos = ifelse(start >= 12875963 & start <= 16344528, 1, 0))
gene_names3 <- data_bigbx3 %>%
group_by(start, gene_ID) %>%
dplyr::summarise(max = max(unqC_Sum)) %>%
filter(max > 2100)
## `summarise()` has grouped output by 'start'. You can override using the
## `.groups` argument.
high_boxplots3 <- data_bigbx3 %>%
group_by(start, gene_ID) %>%
filter(max(unqC_Sum) > 2100)
grDevices::png(filename = "bx_20_v2.png", width = 16.5, height = 10, units = 'cm', res = 400)
bx_20_v2 <- data_bx_20 %>%
ggplot(mapping = aes(x = as.factor(start), y = log_exp, fill = as.factor(qtl_pos))) +
geom_boxplot(alpha = 0.3) +
geom_point(aes(color = allele),
#position = position_jitter(),
size = 1.3) +
stat_summary(fun.y=mean, geom="point", shape=20, size=4, color="#FFA500", fill="#FFA500") +
scale_color_manual("Haplotype",
values = c("#E63946", "#457b92"),
labels = c("My", "Sp")) +
scale_fill_manual("QTL region", values = c("White", "#00A86B"),
labels = c("out", "in")) +
scale_x_discrete(labels = genes_20$gene_ID) +
labs(
#title = "Top 20 genes with highest gene expression",
y = "Log2(Expression + 1)",
x = "Gene ID"
) +
theme_bw() +
theme(
panel.grid.major = element_line(colour = 'grey85', size = 0.2),
panel.grid.minor = element_line(colour = 'grey90', size = 0.1),
axis.title.x = element_blank(),
axis.title.y = element_text(size = 10),
plot.title = element_blank(),
axis.text.x = element_text(angle = 45, hjust=1),
#legend.position = c(0.8,0.2), #You can adjust legend position here
legend.background = element_rect(color = "black"),
legend.title = element_text(size = 8),
legend.text = element_text(size = 8)
)
## Warning: `fun.y` is deprecated. Use `fun` instead.
bx_20_v2
dev.off()
## quartz_off_screen
## 2
knitr::include_graphics("bx_20_v2.png")
grDevices::png(filename = "bx_big1.png", width = 16.5, height = 10, units = 'cm', res = 400)
bx_big1 <- data_bigbx1 %>%
ggplot(mapping = aes(x = as.factor(start), y = unqC_Sum)) +
geom_boxplot() +
geom_boxplot(data = high_boxplots1, aes(x = as.factor(start), y = unqC_Sum, fill = gene_ID, color = gene_ID),
show.legend = FALSE) +
geom_point(data = data_bigbx1 %>% filter(qtl_pos == 1), aes(x = as.factor(start), y = 17500), shape = 15,
size = 2.5, color = "#00A86B", fill = "#00A86B") +
annotate("label", label = "QTL region", y = 18500, x = "14891953", size = 3) +
#geom_rect(aes(xmin= as.factor("12875963"), xmax= as.factor("16344528"), ymin=0, ymax=Inf), alpha = 0.5) +
#geom_segment(aes(x = "12875963", y = 0, xend = "12875963", yend = 17500)) +
#scale_x_discrete(labels = levels(factor(data_bigbx1$gene_ID))) +
scale_x_discrete(labels = levels(factor(data_bigbx1$gene_ID)),
guide = guide_axis(check.overlap = TRUE)
) +
scale_fill_manual(values = colors) +
scale_color_manual(values = colors) +
ggrepel::geom_label_repel(data = gene_names1, aes(x = as.factor(start), y = max, label = gene_ID, fill = gene_ID),
hjust = -0.2,
show.legend = FALSE) +
labs(y = "Total Gene expression") +
theme_bw() +
theme(
#panel.grid.major = element_line(colour = 'grey85', size = 0.2),
#panel.grid.minor = element_line(colour = 'grey90', size = 0.1),
axis.title.x = element_blank(),
axis.title.y = element_text(size = 10),
#axis.title.y = element_blank(),
plot.title = element_text(size = 10),
axis.text.x = element_text(angle = 45, hjust=1),
#legend.position = c(0.8,0.2), #You can adjust legend position here
legend.background = element_rect(color = "black")
)
bx_big1
dev.off()
## quartz_off_screen
## 2
knitr::include_graphics("bx_big1.png")
grDevices::png(filename = "bx_big2.png", width = 16.5, height = 10, units = 'cm', res = 400)
bx_big2 <- data_bigbx2 %>%
ggplot(mapping = aes(x = as.factor(start), y = unqC_Sum)) +
geom_boxplot() +
geom_boxplot(data = high_boxplots2, aes(x = as.factor(start), y = unqC_Sum, fill = gene_ID, color = gene_ID),
show.legend = FALSE) +
geom_point(data = data_bigbx2 %>% filter(qtl_pos == 1), aes(x = as.factor(start), y = 17500), shape = 15,
size = 2.5, color = "#00A86B", fill = "#00A86B") +
annotate("label", label = "QTL region", y = 18500, x = "14795486", size = 3) +
#geom_rect(aes(xmin= as.factor("12875963"), xmax= as.factor("16344528"), ymin=0, ymax=Inf), alpha = 0.5) +
#geom_segment(aes(x = "12875963", y = 0, xend = "12875963", yend = 17500)) +
#scale_x_discrete(labels = levels(factor(data_bigbx1$gene_ID))) +
scale_x_discrete(labels = levels(factor(data_bigbx2$gene_ID)),
guide = guide_axis(check.overlap = TRUE)
) +
scale_fill_manual(values = colors) +
scale_color_manual(values = colors) +
ggrepel::geom_label_repel(data = gene_names2, aes(x = as.factor(start), y = max, label = gene_ID, fill = gene_ID),
hjust = -0.2,
show.legend = FALSE) +
labs(y = "Total Gene expression") +
theme_bw() +
theme(
#panel.grid.major = element_line(colour = 'grey85', size = 0.2),
#panel.grid.minor = element_line(colour = 'grey90', size = 0.1),
axis.title.x = element_blank(),
axis.title.y = element_text(size = 10),
#axis.title.y = element_blank(),
plot.title = element_text(size = 10),
axis.text.x = element_text(angle = 45, hjust=1),
#legend.position = c(0.8,0.2), #You can adjust legend position here
legend.background = element_rect(color = "black")
)
bx_big2
dev.off()
## quartz_off_screen
## 2
knitr::include_graphics("bx_big2.png")
grDevices::png(filename = "bx_big3.png", width = 16.5, height = 10, units = 'cm', res = 400)
bx_big3 <- data_bigbx3 %>%
ggplot(mapping = aes(x = as.factor(start), y = unqC_Sum)) +
geom_boxplot() +
geom_boxplot(data = high_boxplots3, aes(x = as.factor(start), y = unqC_Sum, fill = gene_ID, color = gene_ID),
show.legend = FALSE) +
geom_point(data = data_bigbx3 %>% filter(qtl_pos == 1), aes(x = as.factor(start), y = 10000), shape = 15,
size = 2.5, color = "#00A86B", fill = "#00A86B") +
annotate("label", label = "QTL region", y = 11000, x = "14635146", size = 3) +
#geom_rect(aes(xmin= as.factor("12875963"), xmax= as.factor("16344528"), ymin=0, ymax=Inf), alpha = 0.5) +
#geom_segment(aes(x = "12875963", y = 0, xend = "12875963", yend = 17500)) +
#scale_x_discrete(labels = levels(factor(data_bigbx1$gene_ID))) +
scale_x_discrete(labels = levels(factor(data_bigbx3$gene_ID)),
guide = guide_axis(check.overlap = TRUE)
) +
scale_fill_manual(values = colors) +
scale_color_manual(values = colors) +
ggrepel::geom_label_repel(data = gene_names3, aes(x = as.factor(start), y = max, label = gene_ID, fill = gene_ID),
hjust = -0.2,
show.legend = FALSE) +
labs(y = "Total Gene expression") +
theme_bw() +
theme(
#panel.grid.major = element_line(colour = 'grey85', size = 0.2),
#panel.grid.minor = element_line(colour = 'grey90', size = 0.1),
axis.title.x = element_blank(),
axis.title.y = element_text(size = 10),
#axis.title.y = element_blank(),
plot.title = element_text(size = 10),
axis.text.x = element_text(angle = 45, hjust=1),
#legend.position = c(0.8,0.2), #You can adjust legend position here
legend.background = element_rect(color = "black")
)
bx_big3
dev.off()
## quartz_off_screen
## 2
knitr::include_graphics("bx_big3.png")
result.withRegions <- result.withGeneMap.asDF %>%
mutate(region = case_when(
start < 12875963/2 ~ "1",
start < 12875963 ~ "2",
start < 16344528 ~ "QTL",
TRUE ~ "3"))
We treat gene as having significant difference if it is not from “Non-sig” category
Variables: * num_sig = 1 if gene expression difference is significant, 0 - otherwise * n - number of genes in the region * mean total - average gene expression (mean unqC_total) * sum_num_sig - number of significant genes in the region * perc_sig - percent of significant genes from the overall number of genes in the region
resultsRegionsCat <- result.withRegions %>%
mutate(num_sig = ifelse(Significance == "non-Sig",0,1)) %>%
group_by(region) %>%
summarise(n = n(),
mean_total = mean(unqC_total),
sum_num_sig = sum(num_sig)) %>%
mutate(perc_sig = sum_num_sig/n)
resultsRegionsCat
## # A tibble: 4 × 5
## region n mean_total sum_num_sig perc_sig
## <chr> <int> <dbl> <dbl> <dbl>
## 1 1 449 4079. 51 0.114
## 2 2 276 3682. 36 0.130
## 3 3 516 3693. 33 0.0640
## 4 QTL 520 3912. 47 0.0904
We treat gene as having significant difference if its adjusted p-value < 0.001
resultsRegionsPadj <- result.withRegions %>%
mutate(num_sig = ifelse(padj < 0.001 ,1,0)) %>%
group_by(region) %>%
summarise(n = n(),
mean_total = mean(unqC_total),
sum_num_sig = sum(num_sig)) %>%
mutate(perc_sig = sum_num_sig/n)
resultsRegionsPadj
## # A tibble: 4 × 5
## region n mean_total sum_num_sig perc_sig
## <chr> <int> <dbl> <dbl> <dbl>
## 1 1 449 4079. 56 0.125
## 2 2 276 3682. 31 0.112
## 3 3 516 3693. 53 0.103
## 4 QTL 520 3912. 56 0.108
Tests compare whether proportions in sample differ.
All the analysis checks the proportions of significant genes obtained by adjusted p-value (< 0.001)
check_prop <- resultsRegionsPadj$perc_sig[4] #Exected proportion
chisq.test(c(56, 449-56), p = c(check_prop, 1-check_prop))$p.value #Region 1
## [1] 0.2444053
chisq.test(c(31, 276-31), p = c(check_prop, 1-check_prop))$p.value #Region 2
## [1] 0.8041747
chisq.test(c(53, 516-53), p = c(check_prop, 1-check_prop))$p.value #Region 3
## [1] 0.7152144
There is no case when Chisq test p-value is less than 0.05. Hence, we cannot reject the null hypothesis about the 0 difference.
resultsOutQTL <- result.withRegions %>% filter(region != "QTL") %>%
mutate(num_sig = ifelse(padj < 0.001, 1, 0))
Random samples from this part do not take into account actual genomic positions. For each iteration of the loop random genes are taken to form a dataset. Then the proportion if significant genes is calculated and checked whether it significantly differs from the expected proportions (which is an actual proportion which we obtained for the QTL region).
Then we calculate a share of samples with significantly different proportions of significant genes.
# Prepare dataframe to take loop records.
df_test<- as.data.frame(matrix(nrow = 10000, ncol = 3))
df_test <- df_test %>%
dplyr::rename(num_sig = V1,
prop = V2,
p.value = V3)
set.seed(256)
for (i in 1:10000) {
df_i <- resultsOutQTL[sample(nrow(resultsOutQTL), 520), ]
n_sig <- sum(df_i$num_sig == 1)
n_unsig <- sum(df_i$num_sig == 0)
df_test$num_sig[i] <- n_sig
df_test$prop[i] <- n_sig/nrow(df_i)
df_test$p.value[i] <- chisq.test(c(n_sig, n_unsig), p = c(check_prop, 1 - check_prop))$p.value
}
1 - proportion of significant genes in QTL region is larger than in random sample 0 - otherwise
df_test_sig <- df_test %>% filter(p.value < 0.05) %>%
mutate(compare = ifelse(prop < check_prop, 1, 0))
df_test_sig %>% group_by(compare) %>%
summarise(n =n())
## # A tibble: 2 × 2
## compare n
## <dbl> <int>
## 1 0 236
## 2 1 17
Result: we can see that even if there was a signigicant difference in the proportion of significant genes, in most of the cases this proportion was larger than in QTL region.
grDevices::png(filename = "chi_hist.png", width = 12.5, height = 8, units = 'cm', res = 400)
chi_hist <- df_test %>%
ggplot(aes(x = prop)) +
geom_histogram(bins = 15, fill = "#58508d", alpha = 0.9) +
geom_vline(xintercept = check_prop, color = "red", size = 1) +
scale_x_continuous() +
annotate("label", label = "Expected proportion\n0.108",
x= 0.09 , y = 2000, size = 3) +
annotate("segment", x = 0.099, y = 2000,
xend = check_prop, yend = 2000,
arrow = arrow(length = unit(0.1, "cm"), type = "closed"),
lineend = "butt", linejoin = "mitre",
size = 0.6) +
theme_bw() +
theme(
panel.grid.major = element_line(colour = 'grey85', size = 0.2),
panel.grid.minor = element_line(colour = 'grey90', size = 0.1),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
plot.title = element_blank(),
axis.text.x = element_text(angle = 45, hjust=1),
#legend.position = c(0.8,0.2), #You can adjust legend position here
legend.background = element_rect(color = "black"),
legend.title = element_text(size = 8),
legend.text = element_text(size = 8)
)
chi_hist
dev.off()
## quartz_off_screen
## 2
knitr::include_graphics("chi_hist.png")
We can see that mostly the proportions of significant genes tend to be higher than proportion from the QTL region.
In this part we make samples of equal size which are go before the QTL region by genomic position. The region after the QTL has only 516 genes which were already check by the Chisq test. But we would like to know whether there is an interval of 520 genes which could have significantly different proportion of significant genes.
# Arrange genes outside of qtl region by genomic position
resultsSortedOutQTL <- resultsOutQTL %>%
arrange(start)
# Calculate the number of genes before the start of QTL region
check_rows <- sum(resultsSortedOutQTL$start < 12875963)
All in all there 206 samples of 520 genes before the QTL region.
# Prepare a dataframe for loop records
df_test_2<- as.data.frame(matrix(nrow = check_rows - 520 + 1, ncol = 3))
df_test_2 <- df_test_2 %>%
dplyr::rename(num_sig = V1,
prop = V2,
p.value = V3)
# Loop
for (i in 1:(check_rows-520+1)) {
df_i2 <- resultsSortedOutQTL[i:(i+519),]
n_sig2 <- sum(df_i2$num_sig == 1)
n_unsig2 <- sum(df_i2$num_sig == 0)
df_test_2$num_sig[i] <- n_sig2
df_test_2$prop[i] <- n_sig2/nrow(df_i2)
df_test_2$p.value[i] <- chisq.test(c(n_sig2, n_unsig2), p = c(check_prop, 1 - check_prop))$p.value
}
grDevices::png(filename = "chi_hist_2.png", width = 12.5, height = 8, units = 'cm', res = 400)
chi_hist_2 <- df_test_2 %>%
ggplot(aes(x = prop)) +
geom_histogram(#bins = 15,
fill = "#58508d", alpha = 0.9) +
geom_vline(xintercept = check_prop, color = "red", size = 1) +
scale_x_continuous(limits = c(0.085,0.13)) +
annotate("label", label = "Expected proportion\n0.108",
x= 0.0927 , y = 40, size = 3) +
annotate("segment", x = 0.099, y = 40,
xend = check_prop, yend = 40,
arrow = arrow(length = unit(0.1, "cm"), type = "closed"),
lineend = "butt", linejoin = "mitre",
size = 0.6) +
theme_bw() +
theme(
panel.grid.major = element_line(colour = 'grey85', size = 0.2),
panel.grid.minor = element_line(colour = 'grey90', size = 0.1),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
plot.title = element_blank(),
axis.text.x = element_text(angle = 45, hjust=1),
#legend.position = c(0.8,0.2), #You can adjust legend position here
legend.background = element_rect(color = "black"),
legend.title = element_text(size = 8),
legend.text = element_text(size = 8)
)
chi_hist_2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
dev.off()
## quartz_off_screen
## 2
knitr::include_graphics("chi_hist_2.png")
At the same time, we can cleaely see that in all cases the proportions of significant genes a higher, than that of QTL regions.
library(karyoploteR)
library(Biostrings)
lyr.chr <- fasta.index("lyrata_genome.fa") #%>% head(5)
typeof(lyr.chr)
## [1] "list"
lyr.chr <- lyr.chr %>%
dplyr::select(desc, seqlength) %>%
dplyr::mutate(chrom = base::strsplit(desc, " "))
# only select the first element of the mutant column
lyr.chr$chrom <- base::sapply(lyr.chr$chrom, `[`, 1)
# again select only required columns
lyr.chr <- lyr.chr %>%
dplyr::select(chrom, seqlength)
lyr.chr <- data.frame(lapply(lyr.chr, function(x) {
gsub("scaffold_", "", x)
}))
lyr.chr$chrom <- as.numeric(as.character(lyr.chr$chrom))
lyr.chr$seqlength <- as.numeric(as.character(lyr.chr$seqlength))
lyr.chr <- lyr.chr %>%
filter(as.numeric(chrom) < 9) %>%
arrange(chrom)
lyr.chr$chrom <- base::paste0("chr", lyr.chr$chrom)
lyr.custom.genome <- toGRanges(data.frame(
chr = c(lyr.chr$chrom), start = rep(1, length(lyr.chr$chrom)),
end = c(lyr.chr$seqlength)
))
lyr.custom.genome
## GRanges object with 8 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## 1 chr1 1-33132539 *
## 2 chr2 1-19320864 *
## 3 chr3 1-24464547 *
## 4 chr4 1-23328337 *
## 5 chr5 1-21221946 *
## 6 chr6 1-25113588 *
## 7 chr7 1-24649197 *
## 8 chr8 1-22951293 *
## -------
## seqinfo: 8 sequences from an unspecified genome; no seqlengths
gffRangedGenome.aly <- rtracklayer::import.gff("data/lyrata1.to9.Rawat.gff")
gffRangedGenome.aly %>% head(5)
## GRanges object with 5 ranges and 8 metadata columns:
## seqnames ranges strand | source type score phase
## <Rle> <IRanges> <Rle> | <factor> <factor> <numeric> <integer>
## [1] chr1 1-2541 - | version-2 gene 0.43 <NA>
## [2] chr1 1-2541 - | version-2 transcript 0.43 <NA>
## [3] chr1 62-144 - | version-2 intron 0.96 <NA>
## [4] chr1 253-406 - | version-2 intron 1.00 <NA>
## [5] chr1 783-1422 - | version-2 intron 1.00 <NA>
## ID Name Note Parent
## <character> <character> <CharacterList> <CharacterList>
## [1] AL1G10010 AL1G10010 Protein_Coding_gene
## [2] AL1G10010.t1 <NA> AL1G10010
## [3] AL1G10010.t1 <NA> AL1G10010
## [4] AL1G10010.t1 <NA> AL1G10010
## [5] AL1G10010.t1 <NA> AL1G10010
## -------
## seqinfo: 9 sequences from an unspecified genome; no seqlengths
aly.cytoBands <- as.data.frame(gffRangedGenome.aly) %>%
dplyr::filter(type == "gene") %>%
dplyr::select(seqnames, start, end, strand, width, Name)
# convert cytobands to GRanges object
aly.cytoBands <- toGRanges(aly.cytoBands)
kp.lyr <- plotKaryotype(genome = lyr.custom.genome)
library(AnnotationDbi)
library(GenomicFeatures)
txdb.aly <- makeTxDbFromGFF("data/lyrata1.to9.Rawat.gff", format = "gff",
dataSource = "Rawat et. al",
organism = "Arabidopsis lyrata")
## Import genomic features from the file as a GRanges object ... OK
## Prepare the 'metadata' data frame ... OK
## Make the TxDb object ...
## Warning in .get_cds_IDX(mcols0$type, mcols0$phase): The "phase" metadata column contains non-NA values for features of type
## stop_codon. This information was ignored.
## Warning in makeTxDbFromGRanges(gr, metadata = metadata): The following transcripts were dropped because their exon ranks could
## not be inferred (either because the exons are not on the same
## chromosome/strand or because they are not separated by introns):
## AL1G10010, AL1G10020, AL1G10030, AL1G10040, AL1G10050, AL1G10060,
## AL1G10070, AL1G10080, AL1G10090, AL1G10100, AL1G10110, AL1G10120,
## AL1G10130, AL1G10140, AL1G10150, AL1G10160, AL1G10170, AL1G10180,
## AL1G10190, AL1G10200, AL1G10210, AL1G10220, AL1G10230, AL1G10240,
## AL1G10250, AL1G10260, AL1G10270, AL1G10280, AL1G10290, AL1G10300,
## AL1G10310, AL1G10320, AL1G10330, AL1G10340, AL1G10350, AL1G10360,
## AL1G10370, AL1G10380, AL1G10390, AL1G10400, AL1G10410, AL1G10420,
## AL1G10430, AL1G10450, AL1G10470, AL1G10480, AL1G10490, AL1G10500,
## AL1G10510, AL1G10520, AL1G10530, AL1G10540, AL1G10550, AL1G10560,
## AL1G10570, AL1G10580, AL1G10590, AL1G10600, AL1G10610, AL1G10620,
## AL1G10630, AL1G10640, AL1G10650, AL1G10660, AL1G10670, AL1G10680,
## AL1G10690, AL1G10700, AL1G10710, AL1G10720, AL1G10740, AL1G10750,
## AL1G10760, AL1G10770, AL1G10780, AL1G10790, AL1G10800, AL1G10810,
## AL1G10820, AL1G10830, AL1G10840, AL1G10850, AL1G10860, AL1G10870,
## AL1G10880, AL1G10890, AL1G10900, AL1G10910, AL1G10920, AL1G10930,
## AL1G10940, AL1G10950, AL1G10960, AL1G10970, AL1G10980, AL1G10990,
## AL1G11000, AL1G11010, AL1G11020, AL1G11030, AL1G11040, AL1G11050,
## AL1G11060, AL1G11070, AL1G11080, AL1G11090, AL1G11100, AL1G11110,
## AL1G11120, AL1G11130, AL1G11140, AL1G11150, AL1G11160, AL1G11170,
## AL1G11180, AL1G11200, AL1G11210, AL1G11220, AL1G11230, AL1G11240,
## AL1G11250, AL1G11260, AL1G11270, AL1G11280, AL1G11290, AL1G11310,
## AL1G11320, AL1G11330, AL1G11340, AL1G11350, AL1G11360, AL1G11370,
## AL1G11380, AL1G11390, AL1G11400, AL1G11410, AL1G11420, AL1G11430,
## AL1G11440, AL1G11450, AL1G11460, AL1G11470, AL1G11480, AL1G11500,
## AL1G11510, AL1G11520, AL1G11530, AL1G11540, AL1G11550, AL1G11560,
## AL1G11570, AL1G11580, AL1G11590, AL1G11600, AL1G11610, AL1G11620,
## AL1G11630, AL1G11640, AL1G11650, AL1G11660, AL1G11670, AL1G11680,
## AL1G11690, AL1G11700, AL1G11710, AL1G11720, AL1G11730, AL1G11740,
## AL1G11750, AL1G11760, AL1G11770, AL1G11780, AL1G11790, AL1G11800,
## AL1G11810, AL1G11820, AL1G11840, AL1G11850, AL1G11860, AL1G11870,
## AL1G11880, AL1G11890, AL1G11900, AL1G11910, AL1G11930, AL1G11940,
## AL1G11960, AL1G11970, AL1G11980, AL1G11990, AL1G12000, AL1G12010,
## AL1G12020, AL1G12030, AL1G12040, AL1G12050, AL1G12060, AL1G12070,
## AL1G12080, AL1G12090, AL1G12100, AL1G12110, AL1G12120, AL1G12130,
## AL1G12150, AL1G12160, AL1G12170, AL1G12180, AL1G12190, AL1G12200,
## AL1G12210, AL1G12220, AL1G12230, AL1G12240, AL1G12260, AL1G12270,
## AL1G12280, AL1G12290, AL1G12300, AL1G12310, AL1G12320, AL1G12330,
## AL1G12340, AL1G12350, AL1G12360, AL1G12370, AL1G12380, AL1G12390,
## AL1G12400, AL1G12410, AL1G12420, AL1G12430, AL1G12440, AL1G12450,
## AL1G12460, AL1G12470, AL1G12480, AL1G12490, AL1G12500, AL1G12510,
## AL1G12520, AL1G12530, AL1G12540, AL1G12550, AL1G12560, AL1G12570,
## AL1G12580, AL1G12590, AL1G12600, AL1G12610, AL1G12620, AL1G12630,
## AL1G12640, AL1G12650, AL1G12660, AL1G12670, AL1G12680, AL1G12690,
## AL1G12700, AL1G12710, AL1G12720, AL1G12730, AL1G12740, AL1G12750,
## AL1G12760, AL1G12770, AL1G12780, AL1G12790, AL1G12800, AL1G12810,
## AL1G12820, AL1G12830, AL1G12840, AL1G12850, AL1G12860, AL1G12870,
## AL1G12880, AL1G12890, AL1G12900, AL1G12910, AL1G12920, AL1G12930,
## AL1G12940, AL1G12950, AL1G12960, AL1G12970, AL1G12980, AL1G12990,
## AL1G13000, AL1G13010, AL1G13030, AL1G13040, AL1G13050, AL1G13060,
## AL1G13070, AL1G13090, AL1G13100, AL1G13110, AL1G13120, AL1G13130,
## AL1G13150, AL1G13160, AL1G13170, AL1G13180, AL1G13190, AL1G13200,
## AL1G13210, AL1G13220, AL1G13230, AL1G13240, AL1G13250, AL1G13260,
## AL1G13270, AL1G13280, AL1G13290, AL1G13300, AL1G13310, AL1G13320,
## AL1G13330, AL1G13340, AL1G13350, AL1G13360, AL1G13370, AL1G13380,
## AL1G13390, AL1G13400, AL1G13410, AL1G13430, AL1G13440, AL1G13450,
## AL1G13460, AL1G13470, AL1G13480, AL1G13490, AL1G13500, AL1G13510,
## AL1G13520, AL1G13530, AL1G13540, AL1G13550, AL1G13560, AL1G13570,
## AL1G13580, AL1G13590, AL1G13600, AL1G13610, AL1G13620, AL1G13630,
## AL1G13640, AL1G13650, AL1G13660, AL1G13670, AL1G13680, AL1G13690,
## AL1G13700, AL1G13710, AL1G13720, AL1G13730, AL1G13740, AL1G13750,
## AL1G13760, AL1G13770, AL1G13780, AL1G13790, AL1G13800, AL1G13810,
## AL1G13820, AL1G13830, AL1G13840, AL1G13850, AL1G13860, AL1G13870,
## AL1G13880, AL1G13890, AL1G13900, AL1G13910, AL1G13930, AL1G13940,
## AL1G13950, AL1G13960, AL1G13970, AL1G13980, AL1G13990, AL1G14000,
## AL1G14010, AL1G14020, AL1G14030, AL1G14040, AL1G14050, AL1G14060,
## AL1G14070, AL1G14080, AL1G14090, AL1G14100, AL1G14110, AL1G14120,
## AL1G14130, AL1G14140, AL1G14150, AL1G14160, AL1G14170, AL1G14180,
## AL1G14190, AL1G14200, AL1G14210, AL1G14220, AL1G14230, AL1G14240,
## AL1G14250, AL1G14260, AL1G14270, AL1G14280, AL1G14290, AL1G14300,
## AL1G14310, AL1G14320, AL1G14330, AL1G14340, AL1G14350, AL1G14360,
## AL1G14370, AL1G14380, AL1G14390, AL1G14400, AL1G14410, AL1G14420,
## AL1G14430, AL1G14450, AL1G14460, AL1G14470, AL1G14480, AL1G14490,
## AL1G14500, AL1G14510, AL1G14520, AL1G14530, AL1G14540, AL1G14550,
## AL1G14560, AL1G14570, AL1G14580, AL1G14590, AL1G14600, AL1G14610,
## AL1G14620, AL1G14630, AL1G14640, AL1G14650, AL1G14660, AL1G14670,
## AL1G14680, AL1G14690, AL1G14700, AL1G14710, AL1G14720, AL1G14730,
## AL1G14740, AL1G14750, AL1G14760, AL1G14770, AL1G14780, AL1G14790,
## AL1G14800, AL1G14810, AL1G14820, AL1G14830, AL1G14840, AL1G14850,
## AL1G14860, AL1G14870, AL1G14880, AL1G14890, AL1G14900, AL1G14910,
## AL1G14920, AL1G14930, AL1G14940, AL1G14950, AL1G14960, AL1G14970,
## AL1G14980, AL1G14990, AL1G15000, AL1G15010, AL1G15020, AL1G15030,
## AL1G15040, AL1G15050, AL1G15060, AL1G15070, AL1G15080, AL1G15090,
## AL1G15100, AL1G15110, AL1G15120, AL1G15130, AL1G15140, AL1G15150,
## AL1G15160, AL1G15170, AL1G15180, AL1G15190, AL1G15210, AL1G15220,
## AL1G15240, AL1G15250, AL1G15260, AL1G15270, AL1G15280, AL1G15290,
## AL1G15300, AL1G15310, AL1G15320, AL1G15330, AL1G15340, AL1G15350,
## AL1G15360, AL1G15370, AL1G15380, AL1G15390, AL1G15400, AL1G15410,
## AL1G15420, AL1G15430, AL1G15440, AL1G15450, AL1G15460, AL1G15470,
## AL1G15480, AL1G15490, AL1G15500, AL1G15510, AL1G15520, AL1G15530,
## AL1G15540, AL1G15550, AL1G15560, AL1G15570, AL1G15580, AL1G15590,
## AL1G15600, AL1G15610, AL1G15620, AL1G15630, AL1G15640, AL1G15650,
## AL1G15660, AL1G15670, AL1G15680, AL1G15700, AL1G15710, AL1G15720,
## AL1G15730, AL1G15740, AL1G15750, AL1G15760, AL1G15770, AL1G15780,
## AL1G15790, AL1G15800, AL1G15810, AL1G15820, AL1G15830, AL1G15840,
## AL1G15850, AL1G15860, AL1G15870, AL1G15880, AL1G15890, AL1G15900,
## AL1G15910, AL1G15920, AL1G15940, AL1G15950, AL1G15960, AL1G15970,
## AL1G15980, AL1G15990, AL1G16000, AL1G16010, AL1G16020, AL1G16030,
## AL1G16040, AL1G16050, AL1G16060, AL1G16070, AL1G16080, AL1G16100,
## AL1G16110, AL1G16120, AL1G16130, AL1G16140, AL1G16150, AL1G16160,
## AL1G16170, AL1G16180, AL1G16190, AL1G16200, AL1G16210, AL1G16220,
## AL1G16230, AL1G16240, AL1G16250, AL1G16260, AL1G16270, AL1G16280,
## AL1G16290, AL1G16300, AL1G16310, AL1G16320, AL1G16330, AL1G16340,
## AL1G16350, AL1G16360, AL1G16370, AL1G16380, AL1G16390, AL1G16400,
## AL1G16410, AL1G16420, AL1G16430, AL1G16440, AL1G16450, AL1G16460,
## AL1G16470, AL1G16480, AL1G16490, AL1G16510, AL1G16520, AL1G16530,
## AL1G16540, AL1G16550, AL1G16560, AL1G16570, AL1G16580, AL1G16590,
## AL1G16600, AL1G16610, AL1G16620, AL1G16630, AL1G16640, AL1G16650,
## AL1G16660, AL1G16670, AL1G16680, AL1G16690, AL1G16700, AL1G16710,
## AL1G16720, AL1G16730, AL1G16740, AL1G16750, AL1G16760, AL1G16770,
## AL1G16780, AL1G16790, AL1G16800, AL1G16810, AL1G16820, AL1G16830,
## AL1G16840, AL1G16850, AL1G16860, AL1G16870, AL1G16890, AL1G16910,
## AL1G16920, AL1G16930, AL1G16940, AL1G16950, AL1G16960, AL1G16970,
## AL1G16980, AL1G16990, AL1G17000, AL1G17010, AL1G17020, AL1G17030,
## AL1G17040, AL1G17050, AL1G17060, AL1G17070, AL1G17080, AL1G17090,
## AL1G17100, AL1G17110, AL1G17120, AL1G17140, AL1G17150, AL1G17160,
## AL1G17180, AL1G17190, AL1G17200, AL1G17210, AL1G17230, AL1G17240,
## AL1G17250, AL1G17260, AL1G17270, AL1G17280, AL1G17290, AL1G17300,
## AL1G17310, AL1G17320, AL1G17330
## OK
txdb.aly
## TxDb object:
## # Db type: TxDb
## # Supporting package: GenomicFeatures
## # Data source: Rawat et. al
## # Organism: Arabidopsis lyrata
## # Taxonomy ID: 59689
## # miRBase build ID: NA
## # Genome: NA
## # Nb of transcripts: 32868
## # Db created by: GenomicFeatures package from Bioconductor
## # Creation time: 2022-04-11 19:10:29 +0500 (Mon, 11 Apr 2022)
## # GenomicFeatures version at creation time: 1.46.5
## # RSQLite version at creation time: 2.2.10
## # DBSCHEMAVERSION: 1.2
columns(txdb.aly)
## [1] "CDSCHROM" "CDSEND" "CDSID" "CDSNAME" "CDSPHASE"
## [6] "CDSSTART" "CDSSTRAND" "EXONCHROM" "EXONEND" "EXONID"
## [11] "EXONNAME" "EXONRANK" "EXONSTART" "EXONSTRAND" "GENEID"
## [16] "TXCHROM" "TXEND" "TXID" "TXNAME" "TXSTART"
## [21] "TXSTRAND" "TXTYPE"
summary(txdb.aly)
## Length Class Mode
## 1 TxDb S4
metadata(txdb.aly)
## name
## 1 Db type
## 2 Supporting package
## 3 Data source
## 4 Organism
## 5 Taxonomy ID
## 6 miRBase build ID
## 7 Genome
## 8 Nb of transcripts
## 9 Db created by
## 10 Creation time
## 11 GenomicFeatures version at creation time
## 12 RSQLite version at creation time
## 13 DBSCHEMAVERSION
## value
## 1 TxDb
## 2 GenomicFeatures
## 3 Rawat et. al
## 4 Arabidopsis lyrata
## 5 59689
## 6 <NA>
## 7 <NA>
## 8 32868
## 9 GenomicFeatures package from Bioconductor
## 10 2022-04-11 19:10:29 +0500 (Mon, 11 Apr 2022)
## 11 1.46.5
## 12 2.2.10
## 13 1.2
head(seqlevels(txdb.aly), 15)
## [1] "chr1" "chr2" "chr3" "chr4" "chr5" "chr6" "chr7" "chr8" "Chr9"
genes.aly <- genes(txdb.aly)
genes.aly
## GRanges object with 30142 ranges and 1 metadata column:
## seqnames ranges strand | gene_id
## <Rle> <IRanges> <Rle> | <character>
## AL1G10010 chr1 1-2541 - | AL1G10010
## AL1G10020 chr1 3306-6161 - | AL1G10020
## AL1G10040 chr1 9476-10535 + | AL1G10040
## AL1G10050 chr1 10548-11941 - | AL1G10050
## AL1G10060 chr1 12235-13421 - | AL1G10060
## ... ... ... ... . ...
## AL9U12200 Chr9 1791630-1793287 + | AL9U12200
## AL9U12210 Chr9 1792941-1795545 - | AL9U12210
## AL9U12220 Chr9 1796870-1801565 - | AL9U12220
## AL9U12230 Chr9 1810180-1812874 + | AL9U12230
## AL9U12240 Chr9 1836100-1837535 - | AL9U12240
## -------
## seqinfo: 9 sequences from an unspecified genome; no seqlengths
head(genes.aly,15)
## GRanges object with 15 ranges and 1 metadata column:
## seqnames ranges strand | gene_id
## <Rle> <IRanges> <Rle> | <character>
## AL1G10010 chr1 1-2541 - | AL1G10010
## AL1G10020 chr1 3306-6161 - | AL1G10020
## AL1G10040 chr1 9476-10535 + | AL1G10040
## AL1G10050 chr1 10548-11941 - | AL1G10050
## AL1G10060 chr1 12235-13421 - | AL1G10060
## ... ... ... ... . ...
## AL1G10120 chr1 39834-42611 - | AL1G10120
## AL1G10130 chr1 42668-46181 - | AL1G10130
## AL1G10140 chr1 46346-48809 + | AL1G10140
## AL1G10160 chr1 49457-51261 - | AL1G10160
## AL1G10170 chr1 51346-52761 - | AL1G10170
## -------
## seqinfo: 9 sequences from an unspecified genome; no seqlengths
result.withGeneID.asDF.withRowNames <- result.withGeneMap.asDF
rownames(result.withGeneID.asDF.withRowNames) <-
result.withGeneID.asDF.withRowNames$gene_ID
mcols(genes.aly) <- result.withGeneID.asDF.withRowNames[
names(genes.aly),
c("log2FoldChange", "stat", "pvalue", "padj",
"gene_ID", "gene_Name", "Significance")]
genes.aly.ordered <- genes.aly[base::order(genes.aly$padj, na.last = TRUE), ]
filtered.genes.aly <- genes.aly[!is.na(genes.aly$padj)]
log.pval <- -log10(filtered.genes.aly$padj)
mcols(filtered.genes.aly)$log.pval <- log.pval
sign.genes <- filtered.genes.aly[filtered.genes.aly$padj < 0.05,]
interest.genes <- filtered.genes.aly[filtered.genes.aly$gene_Name %in% list("PIN1", "PIN3", "BRC2")]
top.genes <- genes.aly.ordered[1:6]
filtered.genes.aly[filtered.genes.aly$gene_Name == "BRC2"]
## GRanges object with 0 ranges and 8 metadata columns:
## seqnames ranges strand | log2FoldChange stat pvalue padj
## <Rle> <IRanges> <Rle> | <numeric> <numeric> <numeric> <numeric>
## gene_ID gene_Name Significance log.pval
## <character> <character> <character> <numeric>
## -------
## seqinfo: 9 sequences from an unspecified genome; no seqlengths
fc.ymax <- ceiling(max(abs(range(sign.genes$log2FoldChange))))
fc.ymin <- -fc.ymax
cex.val <- sqrt(sign.genes$log.pval)/3
points.top <- 0.8
# To color the dots
col.over <- "#E63946"
col.under <- "#457b92"
sign.col <- rep(col.over, length(sign.genes))
sign.col[sign.genes$log2FoldChange<0] <- col.under
gene.mean <- start(top.genes) + (end(top.genes) - start(top.genes))/2
grDevices::png(filename = "kp_lyr.png", width = 16.5, height = 10, units = 'cm', res = 400)
# Base Karyoplot
kp.lyr <- plotKaryotype(
genome= lyr.custom.genome, cytobands = aly.cytoBands,
chromosomes = "chr2",
plot.type=2)
## Warning in ideogram.plotter(kp, ...): No 'gieStain' column found in cytobands.
## Using 'gpos50' (gray) for all of them
# Add points
kp.lyr <- kpPoints(kp.lyr, data=sign.genes, y=sign.genes$log2FoldChange, cex=cex.val, ymax=fc.ymax, ymin=fc.ymin, col=sign.col,
#r1=points.top)
)
# Add y-axis
kpAxis(kp.lyr, ymax=fc.ymax, ymin=fc.ymin,
r1=points.top, cex = 0.8
)
# Add name of y label axis
kpAddLabels(kp.lyr, labels = "log2 Fold Change", srt=90, pos=1, label.margin = 0.07, ymax=fc.ymax, ymin=fc.ymin,
r1=points.top, cex = 0.8
)
## Warning in text.default(x = x, y = y, labels = labels, pos = pos, offset = 0, :
## "ymax" is not a graphical parameter
## Warning in text.default(x = x, y = y, labels = labels, pos = pos, offset = 0, :
## "ymin" is not a graphical parameter
gene.mean <- start(top.genes) + (end(top.genes) - start(top.genes))/2
kpSegments(kp.lyr, chr=as.character(seqnames(top.genes)), x0=gene.mean, x1=gene.mean, y0=top.genes$log2FoldChange, y1=fc.ymax, ymax=fc.ymax, ymin=fc.ymin,
r1=points.top
)
# Text markers
kpPlotMarkers(kp.lyr, top.genes, labels = names(top.genes), text.orientation = "horizontal",
r0=points.top, label.dist = 0.003, cex = 0.8
)
kpPlotDensity(kp.lyr, data=filtered.genes.aly, window.size = 10e4, data.panel = 2,
r1 = 0.8)
kp.lyr
## $plot.params
## $plot.params$leftmargin
## [1] 0.1
##
## $plot.params$rightmargin
## [1] 0.05
##
## $plot.params$topmargin
## [1] 120
##
## $plot.params$bottommargin
## [1] 100
##
## $plot.params$ideogramheight
## [1] 50
##
## $plot.params$ideogramlateralmargin
## [1] 0
##
## $plot.params$data1height
## [1] 200
##
## $plot.params$data1inmargin
## [1] 20
##
## $plot.params$data1outmargin
## [1] 20
##
## $plot.params$data1min
## [1] 0
##
## $plot.params$data1max
## [1] 1
##
## $plot.params$data2height
## [1] 200
##
## $plot.params$data2inmargin
## [1] 20
##
## $plot.params$data2outmargin
## [1] 20
##
## $plot.params$data2min
## [1] 0
##
## $plot.params$data2max
## [1] 1
##
## $plot.params$dataideogrammin
## [1] 0
##
## $plot.params$dataideogrammax
## [1] 1
##
## $plot.params$dataallmin
## [1] 0
##
## $plot.params$dataallmax
## [1] 1
##
##
## $genome.name
## [1] "custom"
##
## $genome
## GRanges object with 1 range and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## chr2 chr2 1-19320864 *
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
##
## $cytobands
## GRanges object with 32285 ranges and 3 metadata columns:
## seqnames ranges strand | strand width Name
## <Rle> <IRanges> <Rle> | <factor> <integer> <character>
## 1 chr1 1-2541 * | - 2541 AL1G10010
## 2 chr1 3306-6161 * | - 2856 AL1G10020
## 3 chr1 7429-7630 * | + 202 AL1G10030
## 4 chr1 9476-10535 * | + 1060 AL1G10040
## 5 chr1 10548-11941 * | - 1394 AL1G10050
## ... ... ... ... . ... ... ...
## 32281 Chr9 1791630-1793287 * | + 1658 AL9U12200
## 32282 Chr9 1792941-1795545 * | - 2605 AL9U12210
## 32283 Chr9 1796870-1801565 * | - 4696 AL9U12220
## 32284 Chr9 1810180-1812874 * | + 2695 AL9U12230
## 32285 Chr9 1836100-1837535 * | - 1436 AL9U12240
## -------
## seqinfo: 9 sequences from an unspecified genome; no seqlengths
##
## $plot.type
## [1] 2
##
## $plot.region
## GRanges object with 1 range and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## chr2 chr2 1-19320864 *
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
##
## $zoom
## [1] FALSE
##
## $chromosomes
## [1] "chr2"
##
## $chromosome.lengths
## chr2
## 19320864
##
## $coord.change.function
## function (chr = NULL, x = NULL, y = NULL, data.panel = NULL)
## {
## if (is.null(data.panel))
## stop("In coordChangeFunction: data.panel is required")
## if (!is.null(y)) {
## if (is.null(chr)) {
## stop("If y is not NULL, chr must be specified too")
## }
## if (length(y) != length(chr)) {
## stop("If y is not NULL, it have to have the same length as chr")
## }
## }
## return(genomic2plot(chr = chr, x = x, y = y, data.panel = data.panel,
## genome = genome, plot.params = plot.params))
## }
## <bytecode: 0x7fa80a67a6a0>
## <environment: 0x7fa80aeb7778>
##
## $ideogram.mid
## function (chr)
## {
## return(ideoMid(chr = chr, genome = genome, plot.params = plot.params))
## }
## <bytecode: 0x7fa80a673938>
## <environment: 0x7fa80aeb7778>
##
## $chromosome.height
## [1] 530
##
## $graphical.par
## $graphical.par$old.par
## $graphical.par$old.par$xlog
## [1] FALSE
##
## $graphical.par$old.par$ylog
## [1] FALSE
##
## $graphical.par$old.par$adj
## [1] 0.5
##
## $graphical.par$old.par$ann
## [1] TRUE
##
## $graphical.par$old.par$ask
## [1] FALSE
##
## $graphical.par$old.par$bg
## [1] "white"
##
## $graphical.par$old.par$bty
## [1] "o"
##
## $graphical.par$old.par$cex
## [1] 1
##
## $graphical.par$old.par$cex.axis
## [1] 1
##
## $graphical.par$old.par$cex.lab
## [1] 1
##
## $graphical.par$old.par$cex.main
## [1] 1.2
##
## $graphical.par$old.par$cex.sub
## [1] 1
##
## $graphical.par$old.par$col
## [1] "black"
##
## $graphical.par$old.par$col.axis
## [1] "black"
##
## $graphical.par$old.par$col.lab
## [1] "black"
##
## $graphical.par$old.par$col.main
## [1] "black"
##
## $graphical.par$old.par$col.sub
## [1] "black"
##
## $graphical.par$old.par$crt
## [1] 0
##
## $graphical.par$old.par$err
## [1] 0
##
## $graphical.par$old.par$family
## [1] ""
##
## $graphical.par$old.par$fg
## [1] "black"
##
## $graphical.par$old.par$fig
## [1] 0 1 0 1
##
## $graphical.par$old.par$fin
## [1] 6.496063 3.937008
##
## $graphical.par$old.par$font
## [1] 1
##
## $graphical.par$old.par$font.axis
## [1] 1
##
## $graphical.par$old.par$font.lab
## [1] 1
##
## $graphical.par$old.par$font.main
## [1] 2
##
## $graphical.par$old.par$font.sub
## [1] 1
##
## $graphical.par$old.par$lab
## [1] 5 5 7
##
## $graphical.par$old.par$las
## [1] 0
##
## $graphical.par$old.par$lend
## [1] "round"
##
## $graphical.par$old.par$lheight
## [1] 1
##
## $graphical.par$old.par$ljoin
## [1] "round"
##
## $graphical.par$old.par$lmitre
## [1] 10
##
## $graphical.par$old.par$lty
## [1] "solid"
##
## $graphical.par$old.par$lwd
## [1] 1
##
## $graphical.par$old.par$mai
## [1] 1.02 0.82 0.82 0.42
##
## $graphical.par$old.par$mar
## [1] 5.1 4.1 4.1 2.1
##
## $graphical.par$old.par$mex
## [1] 1
##
## $graphical.par$old.par$mfcol
## [1] 1 1
##
## $graphical.par$old.par$mfg
## [1] 1 1 1 1
##
## $graphical.par$old.par$mfrow
## [1] 1 1
##
## $graphical.par$old.par$mgp
## [1] 3 1 0
##
## $graphical.par$old.par$mkh
## [1] 0.001
##
## $graphical.par$old.par$new
## [1] FALSE
##
## $graphical.par$old.par$oma
## [1] 0 0 0 0
##
## $graphical.par$old.par$omd
## [1] 0 1 0 1
##
## $graphical.par$old.par$omi
## [1] 0 0 0 0
##
## $graphical.par$old.par$pch
## [1] 1
##
## $graphical.par$old.par$pin
## [1] 5.256063 2.097008
##
## $graphical.par$old.par$plt
## [1] 0.1262303 0.9353455 0.2590800 0.7917200
##
## $graphical.par$old.par$ps
## [1] 12
##
## $graphical.par$old.par$pty
## [1] "m"
##
## $graphical.par$old.par$smo
## [1] 1
##
## $graphical.par$old.par$srt
## [1] 0
##
## $graphical.par$old.par$tck
## [1] NA
##
## $graphical.par$old.par$tcl
## [1] -0.5
##
## $graphical.par$old.par$usr
## [1] 0 1 0 1
##
## $graphical.par$old.par$xaxp
## [1] 0 1 5
##
## $graphical.par$old.par$xaxs
## [1] "r"
##
## $graphical.par$old.par$xaxt
## [1] "s"
##
## $graphical.par$old.par$xpd
## [1] FALSE
##
## $graphical.par$old.par$yaxp
## [1] 0 1 5
##
## $graphical.par$old.par$yaxs
## [1] "r"
##
## $graphical.par$old.par$yaxt
## [1] "s"
##
## $graphical.par$old.par$ylbias
## [1] 0.2
##
##
## $graphical.par$new.par
## $graphical.par$new.par$xlog
## [1] FALSE
##
## $graphical.par$new.par$ylog
## [1] FALSE
##
## $graphical.par$new.par$adj
## [1] 0.5
##
## $graphical.par$new.par$ann
## [1] TRUE
##
## $graphical.par$new.par$ask
## [1] FALSE
##
## $graphical.par$new.par$bg
## [1] "white"
##
## $graphical.par$new.par$bty
## [1] "o"
##
## $graphical.par$new.par$cex
## [1] 1
##
## $graphical.par$new.par$cex.axis
## [1] 1
##
## $graphical.par$new.par$cex.lab
## [1] 1
##
## $graphical.par$new.par$cex.main
## [1] 1.2
##
## $graphical.par$new.par$cex.sub
## [1] 1
##
## $graphical.par$new.par$col
## [1] "black"
##
## $graphical.par$new.par$col.axis
## [1] "black"
##
## $graphical.par$new.par$col.lab
## [1] "black"
##
## $graphical.par$new.par$col.main
## [1] "black"
##
## $graphical.par$new.par$col.sub
## [1] "black"
##
## $graphical.par$new.par$crt
## [1] 0
##
## $graphical.par$new.par$err
## [1] 0
##
## $graphical.par$new.par$family
## [1] ""
##
## $graphical.par$new.par$fg
## [1] "black"
##
## $graphical.par$new.par$fig
## [1] 0 1 0 1
##
## $graphical.par$new.par$fin
## [1] 6.496063 3.937008
##
## $graphical.par$new.par$font
## [1] 1
##
## $graphical.par$new.par$font.axis
## [1] 1
##
## $graphical.par$new.par$font.lab
## [1] 1
##
## $graphical.par$new.par$font.main
## [1] 2
##
## $graphical.par$new.par$font.sub
## [1] 1
##
## $graphical.par$new.par$lab
## [1] 5 5 7
##
## $graphical.par$new.par$las
## [1] 0
##
## $graphical.par$new.par$lend
## [1] "round"
##
## $graphical.par$new.par$lheight
## [1] 1
##
## $graphical.par$new.par$ljoin
## [1] "round"
##
## $graphical.par$new.par$lmitre
## [1] 10
##
## $graphical.par$new.par$lty
## [1] "solid"
##
## $graphical.par$new.par$lwd
## [1] 1
##
## $graphical.par$new.par$mai
## [1] 0.02 0.02 0.02 0.02
##
## $graphical.par$new.par$mar
## [1] 0.1 0.1 0.1 0.1
##
## $graphical.par$new.par$mex
## [1] 1
##
## $graphical.par$new.par$mfcol
## [1] 1 1
##
## $graphical.par$new.par$mfg
## [1] 1 1 1 1
##
## $graphical.par$new.par$mfrow
## [1] 1 1
##
## $graphical.par$new.par$mgp
## [1] 3 1 0
##
## $graphical.par$new.par$mkh
## [1] 0.001
##
## $graphical.par$new.par$new
## [1] FALSE
##
## $graphical.par$new.par$oma
## [1] 0 0 0 0
##
## $graphical.par$new.par$omd
## [1] 0 1 0 1
##
## $graphical.par$new.par$omi
## [1] 0 0 0 0
##
## $graphical.par$new.par$pch
## [1] 1
##
## $graphical.par$new.par$pin
## [1] 6.456063 3.897008
##
## $graphical.par$new.par$plt
## [1] 0.003078788 0.996921212 0.005080000 0.994920000
##
## $graphical.par$new.par$ps
## [1] 12
##
## $graphical.par$new.par$pty
## [1] "m"
##
## $graphical.par$new.par$smo
## [1] 1
##
## $graphical.par$new.par$srt
## [1] 0
##
## $graphical.par$new.par$tck
## [1] NA
##
## $graphical.par$new.par$tcl
## [1] -0.5
##
## $graphical.par$new.par$usr
## [1] 0 1 0 750
##
## $graphical.par$new.par$xaxp
## [1] 0 1 5
##
## $graphical.par$new.par$xaxs
## [1] "r"
##
## $graphical.par$new.par$xaxt
## [1] "s"
##
## $graphical.par$new.par$xpd
## [1] FALSE
##
## $graphical.par$new.par$yaxp
## [1] 0 700 7
##
## $graphical.par$new.par$yaxs
## [1] "r"
##
## $graphical.par$new.par$yaxt
## [1] "s"
##
## $graphical.par$new.par$ylbias
## [1] 0.2
##
##
##
## $beginKpPlot
## function ()
## {
## graphics::par(kp$graphical.par$new.par)
## }
## <bytecode: 0x7fa80a7a5820>
## <environment: 0x7fa80b8df1a8>
##
## $endKpPlot
## function ()
## {
## graphics::par(kp$graphical.par$old.par)
## }
## <bytecode: 0x7fa80a7a5d98>
## <environment: 0x7fa80b8df1a8>
##
## $plot
## $plot$xmin
## [1] 0
##
## $plot$xmax
## [1] 1
##
## $plot$ymin
## [1] 0
##
## $plot$ymax
## [1] 750
##
##
## attr(,"class")
## [1] "KaryoPlot"
dev.off()
## quartz_off_screen
## 2
knitr::include_graphics("kp_lyr.png")